def get_content(self, url=None):
    data = []
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html5lib")
    img = soup.select_one(
        '.read-page--photo-gallery--item__picture > img')['data-src']
    contents = soup.select('.article-content-body__item-content > p')
    # Keep only real body paragraphs; drop bylines, video teasers, and credits.
    skip_prefixes = ('*', '(', ' (', 'Reporter',
                     'Saksikan video', 'Saksikan Video', 'Sumber:')
    for paragraph in contents:
        text = paragraph.text.strip()
        if text and not text.startswith(skip_prefixes):
            data.append(text + '\n\n')
    con = ''.join(data)
    con = preprocess_text(con, fix_unicode=True)
    con = self.ner_text(con)
    con2 = ''.join(data)
    con2 = self.ner_text(con2)
    con2 = con2.split('\n\n')
    data_json = {"img": img, "content": con, "content_html": con2}
    return data_json

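# Hedged usage sketch; `scraper` stands for a hypothetical instance of the
# class that defines get_content and ner_text, and the URL is synthetic:
#
#   article = scraper.get_content('https://example.com/some-article')
#   article['img']           # hero-image URL taken from the data-src attribute
#   article['content']       # cleaned full text after preprocess_text + NER
#   article['content_html']  # list of per-paragraph strings
#
# The same return shape applies to the near-identical scrapers below.
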
def get_content(self, url=None):
    data = []
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html5lib")
    sub_category = soup.select('.breadcrumbs > li')[2].text
    img = soup.select_one('figure > a')['href']
    contents = soup.select('#isi > p')
    for content in contents:
        text = content.text.strip()
        # Skip "Baca juga:" / "Baca:" cross-link paragraphs.
        if not text.startswith(('Baca juga:', 'Baca:')):
            data.append(text + '\n\n')
    con = ''.join(data)
    con = preprocess_text(con, fix_unicode=True)
    con = self.ner_text(con)
    con2 = ''.join(data)
    con2 = self.ner_text(con2)
    con2 = con2.split('\n\n')
    data_json = {
        "sub_category": sub_category,
        "img": img,
        "content": con,
        "content_html": con2,
    }
    return data_json

def getContent(self, url=None):
    iData = []
    iResponse = requests.get(url).text
    iSoup = BeautifulSoup(iResponse, "html5lib")
    subCategory = iSoup.select('.breadcrumbs > li')[2].text
    img = iSoup.select_one('figure > a')['href']
    iContents = iSoup.select('#isi > p')
    for content in iContents:
        text = content.text.strip()
        if not text.startswith(('Baca juga:', 'Baca:')):
            iData.append(text + '\n\n')
    ordinaryContent = ''.join(iData)
    ordinaryContent = preprocess_text(ordinaryContent, fix_unicode=True)
    ordinaryContent = self.nerText(ordinaryContent)
    htmlContent = ''.join(iData)
    htmlContent = self.nerText(htmlContent)
    htmlContent = htmlContent.split('\n\n')
    iJson = {
        "subCategory": subCategory,
        "img": img,
        "content": ordinaryContent,
        "contentHTML": htmlContent,
    }
    return iJson

def getContent(self, url=None):
    iData = []
    iResponse = requests.get(url).text
    iSoup = BeautifulSoup(iResponse, "html5lib")
    img = iSoup.select_one(
        '.read-page--photo-gallery--item__picture > img')['data-src']
    contents = iSoup.select('.article-content-body__item-content > p')
    skipPrefixes = ('*', '(', ' (', 'Reporter',
                    'Saksikan video', 'Saksikan Video', 'Sumber:')
    for paragraph in contents:
        text = paragraph.text.strip()
        if text and not text.startswith(skipPrefixes):
            iData.append(text + '\n\n')
    ordinaryContent = ''.join(iData)
    ordinaryContent = preprocess_text(ordinaryContent, fix_unicode=True)
    ordinaryContent = self.nerText(ordinaryContent)
    htmlContent = ''.join(iData)
    htmlContent = self.nerText(htmlContent)
    htmlContent = htmlContent.split('\n\n')
    iJson = {
        "img": img,
        "content": ordinaryContent,
        "contentHTML": htmlContent,
    }
    return iJson

def getContent(self, url=None):
    iData = []
    iResponse = requests.get(url).text
    iSoup = BeautifulSoup(iResponse, "html5lib")
    contents = iSoup.select_one('.photo > img')
    contents2 = iSoup.select('.read__content > p')
    img = contents['data-src']
    # Drop empty paragraphs, cross-links, and promo/credit lines.
    skipPrefixes = ('Baca juga', 'Baca:', 'We are thrilled', 'Flinke',
                    'Baca selengkapnya:', 'Baca berita selengkapnya:',
                    'Sumber:')
    for paragraph in contents2:
        if paragraph.text != '' and not paragraph.text.startswith(skipPrefixes):
            iData.append(paragraph.text + '\n\n')
    ordinaryContent = ''.join(iData)
    ordinaryContent = preprocess_text(ordinaryContent, fix_unicode=True)
    ordinaryContent = self.nerText(ordinaryContent)
    htmlContent = ''.join(iData)
    htmlContent = self.nerText(htmlContent)
    htmlContent = htmlContent.split('\n\n')
    iJson = {
        "img": img,
        "content": ordinaryContent,
        "contentHTML": htmlContent,
    }
    return iJson

def get_content(self, url=None):
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html5lib")
    contents = soup.select_one('.photo > img')
    contents2 = soup.select('.read__content > p')
    temp_img = contents['src']
    data = []
    skip_prefixes = ('Baca juga', 'Baca:', 'We are thrilled', 'Flinke',
                     'Baca selengkapnya:', 'Baca berita selengkapnya:',
                     'Sumber:')
    for paragraph in contents2:
        if paragraph.text != '' and not paragraph.text.startswith(skip_prefixes):
            data.append(paragraph.text + '\n\n')
    con = ''.join(data)
    con = preprocess_text(con, fix_unicode=True)
    con = self.ner_text(con)
    con2 = ''.join(data)
    con2 = self.ner_text(con2)
    con2 = con2.split('\n\n')
    data_json = {"img": temp_img, "content": con, "content_html": con2}
    return data_json

import re

import regex
from textacy.preprocess import preprocess_text


def sym(text: str) -> str:
    """Generalize symbols such as URLs, emails, phone numbers and file
    paths to generic tokens."""
    text = preprocess_text(text, no_emails=True, no_phone_numbers=True,
                           no_accents=True)
    # generalize file paths
    file_path_regex = r'C:(\\\\\S+){2,}|(/\S+){2,}|[Cc]:\\\w+(\\[0-9a-zA-Z_\-]+)+'
    text = re.sub(file_path_regex, ' xxxfilepath ', text)
    # generalize @ mentions
    at_mention_regex = r'\W@\w+'
    text = re.sub(at_mention_regex, ' xxxatmention ', text)
    # get date/time
    text = re.sub(r'\d+[-/]\d+[-/]\d+(.{0,2})?(\d+:\d+:\d+)', ' xxxdatetm ', text)
    # strings that have >=4 dots w/o any whitespace in between
    text = re.sub(r'(\S+\.\S+){4,}', 'xxunk', text)
    # things that look like IP addresses
    text = re.sub(r'\d+\.\d+.\d+\.\d+', 'xxunk', text)
    # long strings or numbers
    text = re.sub(r'\S{30,}|\d{6,}', 'xxunk', text)
    # generalize JSON blobs; the recursive (?R) pattern needs the third-party
    # `regex` module, not the stdlib `re`
    json_regex = r'\{(?:[^{}]|(?R))*\}'
    text = regex.sub(json_regex, ' xxxjson ', text)
    return text

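# Hedged demonstration of sym() on a synthetic string; the exact output
# depends on textacy's replacement tokens and the regexes above, so this
# only sketches the expected shape.
sample = ("Mail user@example.com or call 555-123-4567; "
          "the log lives at C:\\logs\\app\\run")
print(sym(sample))
# The email and phone number are collapsed by preprocess_text, and the
# Windows path is replaced with ' xxxfilepath '.
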
def test_preprocess_text():
    text = (
        "Well… That's a long story. "
        "Hello, world! Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
        "Y'all can't believe you're not who they've said I'll become, but shouldn't. "
        "I learned everything I know from www.stackoverflow.com and http://wikipedia.org/ and Mom. "
        "I can be reached at [email protected] through next Friday. "
        "I can be reached at 555-123-4567 through next Friday. "
        "I owe $1,000.99 to 123 people for 2 +1 reasons. "
        "El niño se asustó -- qué miedo!")
    proc_text = (
        "Well... That's a long story. "
        "Hello, world! Hello... world?\nHello:\nWorld. "
        "You all can not believe you are not who they have said I will become, but should not. "
        "I learned everything I know from *URL* and *URL* and Mom. "
        "I can be reached at *EMAIL* through next Friday. "
        "I can be reached at *PHONE* through next Friday. "
        "I owe USD*NUMBER* to *NUMBER* people for *NUMBER* *NUMBER* reasons. "
        "El nino se asusto -- que miedo!")
    assert preprocess.preprocess_text(text,
                                      fix_unicode=True,
                                      no_urls=True,
                                      no_emails=True,
                                      no_phone_numbers=True,
                                      no_numbers=True,
                                      no_currency_symbols=True,
                                      no_contractions=True,
                                      no_accents=True) == proc_text

def cleanContent(self, iData=None):
    for i in tqdm(range(len(iData)), desc='Clean Content'):
        iData[i]['cleanContent'] = preprocess_text(
            iData[i]['content'],
            lowercase=True,
            fix_unicode=True,
            no_punct=True,
            no_numbers=True,
            no_urls=True,
            no_currency_symbols=True,
            no_phone_numbers=True,
            no_emails=True)
        clean_content = iData[i]['cleanContent'].split()
        text_stopword = [cc for cc in clean_content if cc not in stopwords]
        case_folding = ' '.join(text_stopword)
        # stemming = stemmer.stem(case_folding)
        iData[i]['cleanContent'] = case_folding
    return iData

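# Illustrative call (hedged); `pipeline` is a hypothetical instance of the
# class that owns cleanContent, and the record is synthetic:
#
#   docs = [{'content': 'Contoh berita dengan tautan https://example.com '
#                       'dan angka 42.'}]
#   docs = pipeline.cleanContent(docs)
#   # docs[0]['cleanContent'] -> lowercased text with URLs, numbers,
#   # punctuation, and stopwords removed
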
def process_questions(t):
    text = tap.preprocess_text(t,
                               fix_unicode=True,
                               lowercase=True,
                               transliterate=True,
                               no_urls=True,
                               no_emails=True,
                               no_phone_numbers=True,
                               no_numbers=True,
                               no_currency_symbols=True,
                               no_punct=True,
                               no_contractions=True,
                               no_accents=True)
    return set(word_tokenize(text))

def cleaning_text(text):
    text = preprocess_text(text,
                           no_numbers=True,
                           fix_unicode=True,
                           lowercase=True,
                           no_punct=True)
    # no_numbers leaves placeholder tokens behind; drop them and collapse
    # the resulting whitespace
    text = " ".join(text.replace("number", "").split())
    return text

def clean_text(text):
    text = (text.replace('\n', ' ')
                .replace('.com', ' ')
                .replace('.org', ' ')
                .replace('.net', ' '))
    text = strip_html(text)
    # Remove contractions, if any:
    text = preprocess_text(text,
                           fix_unicode=True,
                           no_accents=True,
                           no_contractions=True,
                           lowercase=True,
                           no_punct=True,
                           no_currency_symbols=True)
    text = replace_urls(text, replace_with='')
    text = replace_numbers(text, replace_with='')
    return text

def clean_sentence(sentences):
    c = sentences.replace('-', ' ')  # people tend to concatenate words with hyphens
    c = normalize_whitespace(c)
    c = preprocess_text(c,
                        lowercase=True,
                        no_numbers=True,
                        no_punct=True,
                        no_contractions=True)
    return c

def cleanContent(self, iData=None):
    for i in tqdm(range(len(iData)), desc='Clean Content'):
        iData[i]['cleanContent'] = preprocess_text(iData[i]['content'],
                                                   lowercase=True,
                                                   fix_unicode=True,
                                                   no_punct=True,
                                                   no_numbers=True)
        clean_content = iData[i]['cleanContent'].split()
        text_stopword = [cc for cc in clean_content if cc not in stopwords]
        iData[i]['cleanContent'] = ' '.join(text_stopword)
    return iData

def clean(text, lower=True, **kwargs):
    text = clean_quotes(text)
    text = preprocess_text(text,
                           fix_unicode=True,
                           lowercase=lower,
                           no_urls=True,
                           no_emails=True,
                           transliterate=True,
                           no_numbers=True,
                           no_phone_numbers=True)
    return text

def preprocess(text,
               fix_unicode=True,
               normalize_white_space=False,
               lowercase=False,
               transliterate=False,
               no_urls=False,
               no_emails=False,
               no_phone_numbers=False,
               no_numbers=False,
               no_currency_symbols=False,
               no_punct=False,
               no_contractions=False,
               no_accents=False):
    if normalize_white_space:
        text = pp.normalize_whitespace(text)
    # Pass the flags by keyword so the call does not depend on textacy's
    # positional argument order.
    text = pp.preprocess_text(text,
                              fix_unicode=fix_unicode,
                              lowercase=lowercase,
                              transliterate=transliterate,
                              no_urls=no_urls,
                              no_emails=no_emails,
                              no_phone_numbers=no_phone_numbers,
                              no_numbers=no_numbers,
                              no_currency_symbols=no_currency_symbols,
                              no_punct=no_punct,
                              no_contractions=no_contractions,
                              no_accents=no_accents)
    return text

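# Hedged usage sketch for the wrapper above; the input string is synthetic
# and the exact URL placeholder depends on textacy's defaults.
cleaned = preprocess("Visit   https://example.com\tNOW!!",
                     normalize_white_space=True,
                     lowercase=True,
                     no_urls=True,
                     no_punct=True)
print(cleaned)  # roughly: "visit url now"
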
def preprocess_sentence(sent):
    # TODO check language?
    s = preprocess.normalize_whitespace(sent)
    return preprocess.preprocess_text(s,
                                      lowercase=True,
                                      transliterate=True,
                                      no_urls=True,
                                      no_phone_numbers=True,
                                      no_numbers=True,
                                      no_currency_symbols=True,
                                      no_contractions=True,
                                      no_accents=True)

def stepOne(self, content=None):
    result = preprocess_text(content,
                             fix_unicode=True,
                             lowercase=True,
                             no_urls=True,
                             no_emails=True,
                             no_phone_numbers=True,
                             no_numbers=True,
                             no_currency_symbols=True,
                             no_punct=True)
    return result

def getCategory(self, iData=None):
    model = joblib.load('modelMNB')
    for data in iData:
        clean = preprocess_text(data['cleanContent'], lowercase=True)
        result = model.predict([clean])
        data['category'] = result[0]
    return iData

def custom_preprocess(self, text):
    text = self.replace_bank_names(text)
    text = preprocess_text(text,
                           fix_unicode=True,
                           lowercase=False,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_punct=False,
                           no_numbers=False)
    text = self.replace_characters_to_space(text)
    return text

def process(content, env, **settings):
    for doc in content:
        try:
            text = doc['text']
            text = preprocess_text(text, **settings)
        except Exception:
            logger.exception(
                "Textacy Processor: got an error in extracting content: %r",
                doc)
            continue
        yield set_text(doc, text)

def preprocess_text_by_config(text, textacy_defs):
    return preprocess_text(
        text,
        fix_unicode=textacy_defs['fix_unicode'],
        lowercase=textacy_defs['lowercase'],
        transliterate=textacy_defs['transliterate'],
        no_urls=textacy_defs['no_urls'],
        no_emails=textacy_defs['no_emails'],
        no_phone_numbers=textacy_defs['no_phone_numbers'],
        no_numbers=textacy_defs['no_numbers'],
        no_currency_symbols=textacy_defs['no_currency_symbols'],
        no_punct=textacy_defs['no_punct'],
        no_contractions=textacy_defs['no_contractions'],
        no_accents=textacy_defs['no_accents'])

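# A hypothetical textacy_defs mapping covering every key the function reads;
# the values here are illustrative defaults, not taken from any real config.
textacy_defs = {
    'fix_unicode': True,
    'lowercase': True,
    'transliterate': False,
    'no_urls': True,
    'no_emails': True,
    'no_phone_numbers': True,
    'no_numbers': False,
    'no_currency_symbols': False,
    'no_punct': False,
    'no_contractions': True,
    'no_accents': True,
}
cleaned = preprocess_text_by_config("Écrivez à https://example.com!", textacy_defs)
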
def from_feed(self, url):
    fdict = fp.parse(url)
    for entry in fdict.entries:
        # Each entry may have multiple pieces of content. Here they're just
        # concatenated.
        body = ""
        for c in entry.content:
            body += " " + c.value
        # Preprocessing
        body = pre.preprocess_text(body,
                                   no_urls=True,
                                   no_emails=True,
                                   no_phone_numbers=True)
        metadata = {'title': entry.title,
                    'author': entry.author,
                    'date_updated': entry.updated,
                    'publication_title': fdict.feed.title}
        self.add_text(body, metadata=metadata)

def textacy_cleaner(text: str) -> str:
    if isinstance(text, numbers.Number) and numpy.isnan(text):
        logging.warning("Received nan instead of str")
        return "nan"
    return preprocess_text(text,
                           fix_unicode=False,
                           lowercase=True,
                           transliterate=True,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=False,
                           no_accents=True)

def textacy_cleaner(text: str) -> str:
    """Default function for cleaning a single text string; map it over a
    list to clean a corpus."""
    return preprocess_text(text,
                           fix_unicode=True,
                           lowercase=True,
                           transliterate=True,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=False,
                           no_accents=True)

def str_clean_up(x, nlp):
    import re

    # replace the "..." by " "
    # raw example: "to...find...this...purpose...\nof...a 'voice' hearer...is...to...go.."
    out = x.replace("...", " ")
    out = out.replace("\n", " ")
    # replace all punctuation
    # out = out.replace('.', '')
    # out = out.replace(',', '')
    # out = out.replace('"', '')
    # out = out.replace('?', '')
    # out = out.replace('!', '')
    # replace all slashes
    out = out.replace('\\', '')
    out = out.replace('/', '')
    # fix unicode, currency, contractions, accents
    out = preprocess_text(out,
                          fix_unicode=True,
                          transliterate=True,
                          no_currency_symbols=True,
                          no_contractions=True,
                          no_accents=True,
                          no_urls=True,
                          no_emails=True)
    # drop tokens that still look like URLs or emails
    out_doc = nlp(out)
    text = []
    for token in out_doc:
        if token.like_url or token.like_email:
            pass
        else:
            text.append(token.text)
    out = " ".join(text)
    # remove any remaining characters outside the allowed set
    out = re.sub("[^A-Za-z0-9 ?'.:;!]+", "", out)
    return out

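# Hedged usage: str_clean_up expects a loaded spaCy pipeline for the
# like_url / like_email filtering pass; 'en_core_web_sm' is an assumed
# model name, not one specified by the original code.
import spacy

nlp = spacy.load('en_core_web_sm')
print(str_clean_up("go...to...https://example.com now", nlp))
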
def preprocessText(self, strtxt, lang='en', ner=False):
    self.utilclass = UtilityClass()
    posttxt = str(strtxt)
    # if ner:
    #     posttxt = self.processNER(posttxt, lang=lang)
    posttxt = preprocess_text(posttxt,
                              fix_unicode=True,
                              lowercase=False,
                              transliterate=False,
                              no_urls=True,
                              no_emails=True,
                              no_phone_numbers=True,
                              no_numbers=True,
                              no_currency_symbols=True,
                              no_punct=False,
                              no_contractions=False,
                              no_accents=False)
    return posttxt

def textacy_cleaner(text: str) -> str:
    if isinstance(text, (int, float, complex)):
        # workaround: module-not-found error if imported inside the model
        import numpy
        import logging
        if numpy.isnan(text):
            logging.warning("Received nan instead of str")
            return "nan"
    from textacy.preprocess import preprocess_text
    return preprocess_text(text,
                           fix_unicode=False,
                           lowercase=True,
                           transliterate=True,
                           no_urls=True,
                           no_emails=True,
                           no_phone_numbers=True,
                           no_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=False,
                           no_accents=True)

def preprocess_f(text,
                 fix_unicode=True,
                 lowercase=True,
                 no_urls=True,
                 no_emails=True,
                 no_phone_numbers=True,
                 no_numbers=True,
                 no_currency_symbols=True,
                 no_punct=True,
                 no_accents=True):
    """Preprocess text."""
    clean_text = preprocess_text(text,
                                 fix_unicode=fix_unicode,
                                 lowercase=lowercase,
                                 no_urls=no_urls,
                                 no_emails=no_emails,
                                 no_phone_numbers=no_phone_numbers,
                                 no_numbers=no_numbers,
                                 no_currency_symbols=no_currency_symbols,
                                 no_punct=no_punct,
                                 no_accents=no_accents)
    return clean_text

def preprocess_text_string(text):
    """Preprocesses text for feature extraction.

    Preprocessing tasks are as follows:
        - whitespace normalization
        - fixing broken unicode via ftfy
        - converting text to lowercase
        - replacing url strings with 'url'
        - replacing phone number strings with 'phone'
        - replacing currency symbols with their standard 3-letter abbreviations
        - stripping punctuation
        - replacing contractions with their unshortened forms
        - lemmatizing words

    Parameters
    ----------
    text : str
        The input text to be preprocessed.

    Returns
    -------
    preprocessed : str
        The preprocessed output text.
    """
    text = preprocess_text(text,
                           fix_unicode=True,
                           lowercase=True,
                           no_urls=True,
                           no_phone_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=True)
    doc = Doc(text, lang='en')
    lemmatized_tokens = doc.to_terms_list(ngrams=1,
                                          named_entities=False,
                                          as_strings=True,
                                          normalize='lemma')
    return ' '.join(lemmatized_tokens)

def tokenizer(sentences):
    y = []
    if isinstance(sentences, str):
        sentences = [sentences]
    for comment in sentences:
        comment = my_preprocess(comment)
        txt = preprocess.normalize_whitespace(comment)
        txt = preprocess.preprocess_text(txt,
                                         fix_unicode=True,
                                         lowercase=True,
                                         transliterate=True,
                                         no_urls=True,
                                         no_emails=True,
                                         no_phone_numbers=True,
                                         no_numbers=True,
                                         no_currency_symbols=True,
                                         no_punct=True,
                                         no_contractions=True,
                                         no_accents=True)
        y.append(txt)
    return y

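# Hedged example call for tokenizer; my_preprocess is assumed to be defined
# alongside the original function, and the input string is synthetic.
print(tokenizer(["Check https://example.com, it's GREAT!!!"]))
# -> a list containing one cleaned, lowercased string
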