def filter_wiki(raw): """ Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode or utf-8 encoded string. """ # parsing of the wiki markup is not perfect, but sufficient for our purposes # contributions to improving this code are welcome :) text = utils.decode_htmlentities(utils.to_unicode(raw, "utf8", errors="ignore")) text = utils.decode_htmlentities(text) # ' ' --> '\xa0' return remove_markup(text)
def filter_wiki(raw): """ Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode or utf-8 encoded string. """ # parsing of the wiki markup is not perfect, but sufficient for our purposes # contributions to improving this code are welcome :) text = utils.decode_htmlentities( utils.to_unicode(raw, 'utf8', errors='ignore')) text = utils.decode_htmlentities(text) # ' ' --> '\xa0' return remove_markup(text)
def preprocess_text(tweet):
    """
    Function to process an aggregated user profile. This does the following:
    1. Decode html entities, e.g. "AT&amp;T" will become "AT&T"
    2. Deaccent
    3. Remove links.
    4. Remove any user mentions (@name).
    5. Lemmatize and remove stopwords.

    Parameters:
    ----------
    text : String. If train_texts is a list of tweets, ' '.join and pass

    Returns:
    -------
    text : preprocessed (tokenized) tweet.
    """
    tweet = decode_htmlentities(tweet)
    tweet = deaccent(tweet)
    tweet = tweet.encode('ascii', 'ignore')  # to prevent UnicodeDecodeErrors later on
    tweet = re.sub(r'http\S+', '', str(tweet))  # Step 3
    tweet = re.sub(r'@\w+', '', str(tweet))  # Step 4
    tweet = tweet.split()
    tweet = lemmatize(' '.join(tweet), re.compile('(NN)'),
                      stopwords=stopwords.words('english'),
                      min_length=3, max_length=15)
    tweet = [word.split('/')[0] for word in tweet]
    return tweet
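# Illustrative call (not from the original source). It assumes decode_htmlentities,
# deaccent and lemmatize are imported from gensim.utils, NLTK stopwords are
# downloaded, and the Pattern library backing lemmatize is installed; the sample
# tweet is made up.
sample_tweet = "Loving the new phones from AT&amp;T! http://example.com @someone"
print(preprocess_text(sample_tweet))
# roughly expected: a short list of lemmatized noun tokens, e.g. something like ['phone']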
def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `raw`, leaving only text.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified keeping only their description text.

    Returns
    -------
    str
        `raw` without markup.

    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'

    # the cleaning process
    cleaned_text = remove_markup(text, promote_remaining, simplify_links)
    return cleaned_text
def preprocessing(revisions):
    print "preprocessing called..."
    corpus = []
    stop_words = set(stopwords.words('english'))
    pattern = re.compile(r'\s\s+')  # collapses runs of whitespace
    punctuation = re.compile(r'[^a-zA-Z]+')  # strips everything that is not a letter
    third_layer = re.compile(r'(^| ).( |$)')  # drops stray single characters
    count = 0
    for each_revision in revisions:
        try:
            count = count + 1
            text = utils.decode_htmlentities(utils.to_unicode(each_revision, 'utf8', errors='ignore'))
            punc_text = punctuation.sub(" ", remove_markup(text))
            text = re.sub(pattern, ' ', punc_text)  # remove extra white spaces
            third_text = re.sub(third_layer, ' ', text)  # remove leftover single characters
            tokenize = nltk.word_tokenize(third_text)
            lowered_text = [w.lower().strip() for w in tokenize]
            text_without_stopwords = [w for w in lowered_text if w not in stop_words]
            corpus.append(text_without_stopwords)
            if count % 100 == 0:
                print "Preprocessed {} revisions".format(count)
        except Exception:
            continue  # skip revisions that fail to decode or tokenize
    print len(corpus)
    return corpus
def filterWiki(raw):
    """
    Filter out wiki mark-up from utf8 string `raw`, leaving only text.
    """
    # parsing the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.decode_htmlentities(unicode(raw, 'utf8', 'ignore'))
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
    text = re.sub(RE_P2, "", text)  # remove the last list (=languages)
    # the wiki markup is recursive (markup inside markup etc)
    # instead of writing a recursive grammar, here we deal with that by removing
    # markup in a loop, starting with inner-most expressions and working outwards,
    # as long as something changes.
    iters = 0
    while True:
        old, iters = text, iters + 1
        text = re.sub(RE_P0, "", text)  # remove comments
        text = re.sub(RE_P1, '', text)  # remove footnotes
        text = re.sub(RE_P9, "", text)  # remove outside links
        text = re.sub(RE_P10, "", text)  # remove math content
        text = re.sub(RE_P11, "", text)  # remove all remaining tags
        # remove templates (no recursion)
        text = re.sub(RE_P3, '', text)
        text = re.sub(RE_P4, '', text)
        text = re.sub(RE_P5, '\\3', text)  # remove urls, keep description
        text = re.sub(RE_P7, '\n\\3', text)  # simplify images, keep description only
        text = re.sub(RE_P8, '\n\\3', text)  # simplify files, keep description only
        text = re.sub(RE_P6, '\\2', text)  # simplify links, keep description only
        # remove table markup
        text = text.replace('||', '\n|')  # each table cell on a separate line
        text = re.sub(RE_P12, '\n', text)  # remove formatting lines
        text = re.sub(RE_P13, '\n\\3', text)  # leave only cell content
        # remove empty mark-up
        text = text.replace('[]', '')
        # stop if nothing changed between two iterations or after a fixed number of iterations
        if old == text or iters > 2:
            break
    # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
    # TODO is this really desirable?
    text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text
    return text
def get_article_claims(args):
    text, lemmatize, title, pageid, set_citation, quote_identifiers = args
    text = utils.to_unicode(text, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)
    plaintext = get_plain_with_cnmarks(text, set_citation, quote_identifiers)
    claims = ArticleClaims(pageid, title, plaintext)
    claims.from_text()
    return claims
def filter_wiki(raw): """ Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode or utf-8 encoded string. """ # parsing of the wiki markup is not perfect, but sufficient for our purposes # contributions to improving this code are welcome :) text = utils.decode_htmlentities(utils.to_unicode(raw, 'utf8', errors='ignore')) text = utils.decode_htmlentities(text) # '&nbsp;' --> '\xa0' text = re.sub(RE_P2, "", text) # remove the last list (=languages) # the wiki markup is recursive (markup inside markup etc) # instead of writing a recursive grammar, here we deal with that by removing # markup in a loop, starting with inner-most expressions and working outwards, # for as long as something changes. iters = 0 while True: old, iters = text, iters + 1 text = re.sub(RE_P0, "", text) # remove comments text = re.sub(RE_P1, '', text) # remove footnotes text = re.sub(RE_P9, "", text) # remove outside links text = re.sub(RE_P10, "", text) # remove math content text = re.sub(RE_P11, "", text) # remove all remaining tags # remove templates (no recursion) text = re.sub(RE_P3, '', text) text = re.sub(RE_P4, '', text) text = re.sub(RE_P5, '\\3', text) # remove urls, keep description text = re.sub(RE_P7, '\n\\3', text) # simplify images, keep description only text = re.sub(RE_P8, '\n\\3', text) # simplify files, keep description only text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only # remove table markup text = text.replace('||', '\n|') # each table cell on a separate line text = re.sub(RE_P12, '\n', text) # remove formatting lines text = re.sub(RE_P13, '\n\\3', text) # leave only cell content # remove empty mark-up text = text.replace('[]', '') if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations break # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists' # TODO is this really desirable? text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text return text
def iterate_wiki(input_path):
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(input_path), filter_namespaces))
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        yield title, tokens
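# Illustrative driver (not from the original source): stream titles and token lists
# from a bz2-compressed Wikipedia dump. The dump path is a placeholder, and the
# surrounding module is assumed to define extract_pages, remove_markup,
# get_all_words, ARTICLE_MIN_WORDS and IGNORED_NAMESPACES as used above.
for title, tokens in iterate_wiki('enwiki-latest-pages-articles.xml.bz2'):
    print(title, len(tokens))
    break  # inspect only the first article that passes the length filter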
def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `raw`, leaving only text.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified keeping only their description text.

    Returns
    -------
    str
        `raw` without markup.

    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
    return remove_markup(text, promote_remaining, simplify_links)
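# Sketch of the two keyword arguments (illustrative, assuming the gensim 3.x
# signature shown above): promote_remaining=False leaves uncaught bracket markup in
# place instead of stripping it, and simplify_links=False keeps link markup instead
# of reducing it to the description text.
from gensim.corpora.wikicorpus import filter_wiki

markup = "See the [[style guide|guide]] for details."
print(filter_wiki(markup))
# expected to print roughly: See the guide for details.
print(filter_wiki(markup, promote_remaining=False, simplify_links=False))
# expected to leave the [[style guide|guide]] link markup untouched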
def get_raw_text_and_links_from_markup(raw):
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
    return __remove_markup(text)
def filter_wiki(raw):
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
    return remove_markup(text)  # , promote_remaining=False, simplify_links=True)
def process_enwikinews(s, verbose=True):
    # extract text (vs pageid and title)
    ss = RE_P17.split(s)
    if len(ss) != 3 or not ss[2]:
        if verbose:
            print('There is no text in this article.')
        return None
    # [:-1] means remove the last '\n' in RE_P17
    pageid, _, title = ss[1][:-1].partition('\n')
    if any(title.lower().startswith(x) for x in IGNORED_TITLES):
        if verbose:
            print('This article does not have a normal title.')
        return None
    text = ss[2].strip()
    if any(text.lower().startswith(x) for x in IGNORED_TEXTS):
        if verbose:
            print('This article does not have a normal text.')
        return None

    # extract main content of text (i.e. remove its tail)
    text = text.partition('{{haveyoursay}}')
    if not text[1]:
        text = text[0].partition('==')
    text = text[0].strip()

    # from filter_wiki()
    text = utils.to_unicode(text, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)

    # from remove_markup()
    text = RE_P2.sub('', text)  # remove the last list (=languages)
    ## template-related (for future: ...=...)
    ### {{Brazil}}
    text = RE_template_1.sub(r'\1\n', text)
    ### {{date|November 13, 2004}}
    text = RE_template_2.sub(_repl_or, text)
    ## file[/image]-related
    text = RE_P15.sub('', text)
    ## the rest
    iters = 0
    while True:
        old, iters = text, iters + 1
        text = RE_P0.sub('', text)  # remove comments (pageid = 1471698)
        text = RE_P1.sub('', text)  # remove footnotes
        text = RE_P9.sub('', text)  # remove outside links
        text = RE_P10.sub('MATH', text)  # remove math content
        text = RE_P11.sub('', text)  # remove all remaining tags
        text = RE_P14_edited.sub('', text)  # remove categories
        text = RE_P5.sub(r'\3', text)  # remove urls, keep description
        text = RE_P6.sub(r'\2', text)  # simplify links, keep description only
        # remove table markup
        text = text.replace('||', '\n|')  # each table cell on a separate line
        text = RE_P12.sub('\n', text)  # remove formatting lines
        text = RE_P13.sub(r'\n\3', text)  # leave only cell content
        # remove empty mark-up
        text = text.replace('[]', '')
        # stop if nothing changed between two iterations or after a fixed number of iterations
        if old == text or iters > 2:
            break
    text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text

    # {{byline|date=November 14, 2004|location=RAMALLAH}}
    text = RE_template_3.sub(_repl_byline, text)

    # cleaning
    text = remove_template(text)  # pageid = 113289
    text = re.sub('(\n)+', '\n', text).strip()
    if not text:
        if verbose:
            print('This article does not have a normal text.')
        return None
    return pageid, title, text
def test_decode_entities(self):
    # create a string that fails to decode with unichr on narrow python builds
    body = u'It&#146;s the Year of the Horse. YES VIN DIESEL &#128588; &#128175;'
    expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af'
    self.assertEqual(utils.decode_htmlentities(body), expected)
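# Stand-alone illustration of the helper under test (assumes a gensim release that
# still ships gensim.utils.decode_htmlentities, e.g. the 3.x series). Named,
# decimal and hexadecimal character references are all replaced by characters.
from gensim import utils

print(utils.decode_htmlentities(u'caf&#233; &amp; r&eacute;sum&#xE9;'))
# expected to print roughly: café & résumé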
def config_argparser():
    argparser = argparse.ArgumentParser(description='Wikipedia Dump Extractor')
    argparser.add_argument('-input_path', type=str, required=True, help='Path to the raw Wikipedia dump')
    argparser.add_argument('-output_path', type=str, required=True, help='Write path for extracted text content')
    return argparser.parse_args()


if __name__ == '__main__':
    arguments = config_argparser()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(arguments.input_path), filter_namespaces))
    parsed_article_counter = 0
    space = u' '
    output = codecs.open(arguments.output_path, 'w', 'utf-8')
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        output.write("{}\n".format(space.join(tokens) + "\n"))
        parsed_article_counter += 1
    print('Parsed articles: {}'.format(parsed_article_counter))
def __decode_htmlentities(self, text):
    return utils.decode_htmlentities(text)