Example #1
def filter_wiki(raw):
    """
    Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
    or utf-8 encoded string.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.decode_htmlentities(utils.to_unicode(raw, "utf8", errors="ignore"))
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return remove_markup(text)
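A minimal usage sketch for reference, assuming filter_wiki is importable from gensim.corpora.wikicorpus as in the gensim-based snippets on this page; the markup string is made up for illustration and the exact output depends on the gensim version:

from gensim.corpora.wikicorpus import filter_wiki

raw = "A [[political philosophy|philosophy]] article &amp; some <!-- hidden comment --> text."
print(filter_wiki(raw))
# Prints approximately: "A philosophy article & some  text."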
Example #2
def filter_wiki(raw):
    """
    Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
    or utf-8 encoded string.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.decode_htmlentities(
        utils.to_unicode(raw, 'utf8', errors='ignore'))
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return remove_markup(text)
Example #3
def preprocess_text(tweet):
    """
    Function to process an aggregated user profile. This does the following:
    1. Decode html entities. eg. "AT&amp;T" will become "AT&T"
    2. Deaccent
    3. Remove links.
    4. Remove any user mentions (@name).
    5. Lemmatize and remove stopwords.
    
    Parameters:
    ----------
    tweet : String. If train_texts is a list of tweets, ' '.join them and pass the result.

    Returns:
    -------
    tweet : list of preprocessed (tokenized) words from the tweet.
    """
    tweet = decode_htmlentities(tweet)
    tweet = deaccent(tweet)
    tweet = tweet.encode('ascii',
                         'ignore')  # To prevent UnicodeDecodeErrors later on
    tweet = re.sub(r'http\S+', '', str(tweet))  # Step 3
    tweet = re.sub(r'@\w+', '', str(tweet))  # Step 4
    tweet = tweet.split()
    tweet = lemmatize(' '.join(tweet),
                      re.compile('(NN)'),
                      stopwords=stopwords.words('english'),
                      min_length=3,
                      max_length=15)
    tweet = [word.split('/')[0] for word in tweet]
    return tweet
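The closing list comprehension strips the part-of-speech tag that gensim's lemmatize appends to each token (the 'word/POS' convention); a tiny self-contained illustration of that step with made-up tokens:

# Hypothetical lemmatizer output in the 'token/POS' format used by gensim's utils.lemmatize
tagged = ['horse/NN', 'year/NN', 'profile/NN']
lemmas = [word.split('/')[0] for word in tagged]
print(lemmas)  # ['horse', 'year', 'profile']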
Example #4
def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `raw`, leaving only text.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified keeping only their description text.

    Returns
    -------
    str
        `raw` without markup.

    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    # the cleaning process
    cleaned_text = remove_markup(text, promote_remaining, simplify_links)
    return cleaned_text
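A short sketch of how the two keyword flags are meant to be used with this variant of filter_wiki; the sample markup is illustrative and the comments paraphrase the docstring rather than guarantee exact output:

from gensim.corpora.wikicorpus import filter_wiki

raw = "See [[Special:Export|the export page]] for details."
# simplify_links=True keeps only the description text of piped links
print(filter_wiki(raw, promote_remaining=True, simplify_links=True))
# promote_remaining=False leaves any uncaught markup in place instead of promoting it to plain text
print(filter_wiki(raw, promote_remaining=False, simplify_links=True))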
Example #5
def preprocessing(revisions):
	print "preprocessing called..."
	corpus = []
	stop_words  = set(stopwords.words('english'))
	punctuation = re.compile(r'[-.?!/,":;*()=%$\'/\\&_\[\]}{<>#^\-+@|0-9]')
	pattern = re.compile(r'\s\s+')
	punctuation = re.compile(r'[^a-zA-Z]+')
	third_layer = re.compile(r'(^| ).( |$)')
	# # start_time = time.time()
	count = 0
	for each_revision in revisions:
		try:
			count = count +1
			text = utils.decode_htmlentities(utils.to_unicode(each_revision, 'utf8', errors='ignore'))
			# text = utils.decode_htmlentities(text)
			punc_text = punctuation.sub(" ", remove_markup(text))
			text = re.sub(pattern, ' ', punc_text) #to remove extra white spaces.
			third_text = re.sub(third_layer, ' ', text)  #to remove all other characters except for text
			tokenize = nltk.word_tokenize(third_text)
			lowered_text = [w.lower().strip() for w in tokenize] 
			text_without_stopwords = [w for w in lowered_text if not w in stop_words]
			# # print text_without_stopwords
			corpus.append(text_without_stopwords)
			if count%100 == 0:
				print "Preprocessed {} revision".format(count)
		except:
			sys.exc_info()[0]
			next
	print len(corpus)
	return corpus
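The three regular expressions above act as successive cleanup layers: strip non-letters, collapse runs of whitespace, then drop isolated single characters. A small self-contained sketch of that pipeline on a toy string, using the same pattern names:

import re

punctuation = re.compile(r'[^a-zA-Z]+')   # replace every non-letter run with a space
pattern = re.compile(r'\s\s+')            # collapse repeated whitespace
third_layer = re.compile(r'(^| ).( |$)')  # drop single stray characters

text = "Revision 42: see [[Main_Page]], please!"
text = punctuation.sub(" ", text)
text = re.sub(pattern, ' ', text)
text = re.sub(third_layer, ' ', text)
print(text)  # roughly: 'Revision see Main Page please '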
Example #6
def filterWiki(raw):
    """
    Filter out wiki mark-up from utf8 string `raw`, leaving only text.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.decode_htmlentities(unicode(raw, 'utf8', 'ignore'))
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    text = re.sub(RE_P2, "", text)  # remove the last list (=languages)
    # the wiki markup is recursive (markup inside markup etc)
    # instead of writing a recursive grammar, here we deal with that by removing
    # markup in a loop, starting with inner-most expressions and working outwards,
    # as long as something changes.
    iters = 0
    while True:
        old, iters = text, iters + 1
        text = re.sub(RE_P0, "", text)  # remove comments
        text = re.sub(RE_P1, '', text)  # remove footnotes
        text = re.sub(RE_P9, "", text)  # remove outside links
        text = re.sub(RE_P10, "", text)  # remove math content
        text = re.sub(RE_P11, "", text)  # remove all remaining tags
        # remove templates (no recursion)
        text = re.sub(RE_P3, '', text)
        text = re.sub(RE_P4, '', text)
        text = re.sub(RE_P5, '\\3', text)  # remove urls, keep description
        text = re.sub(RE_P7, '\n\\3',
                      text)  # simplify images, keep description only
        text = re.sub(RE_P8, '\n\\3',
                      text)  # simplify files, keep description only
        text = re.sub(RE_P6, '\\2',
                      text)  # simplify links, keep description only
        # remove table markup
        text = text.replace('||', '\n|')  # each table cell on a separate line
        text = re.sub(RE_P12, '\n', text)  # remove formatting lines
        text = re.sub(RE_P13, '\n\\3', text)  # leave only cell content
        # remove empty mark-up
        text = text.replace('[]', '')
        if old == text or iters > 2:  # stop if nothing changed between two iterations or after a fixed number of iterations
            break

    # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
    # TODO is this really desirable?
    text = text.replace('[', '').replace(
        ']', '')  # promote all remaining markup to plain text
    return text
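The while-loop above handles nested wiki markup by re-applying the substitutions until the text stops changing (or an iteration cap is reached). A stripped-down sketch of that fixed-point idea, using one illustrative pattern rather than the real RE_P* expressions:

import re

# toy pattern: an innermost {{...}} template containing no nested braces
RE_TEMPLATE = re.compile(r'\{\{[^{}]*\}\}')

def strip_nested_templates(text, max_iters=10):
    iters = 0
    while True:
        old, iters = text, iters + 1
        text = RE_TEMPLATE.sub('', text)  # peel off the innermost layer
        if old == text or iters > max_iters:
            break
    return text

print(strip_nested_templates("keep {{outer {{inner}} template}} this"))  # 'keep  this'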
Example #7
def get_article_claims(args):
    text, lemmatize, title, pageid, set_citation, quote_identifiers = args
    text = utils.to_unicode(text, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)
    plaintext = get_plain_with_cnmarks(text, set_citation, quote_identifiers)

    claims = ArticleClaims(pageid, title, plaintext)
    claims.from_text()

    return claims
Example #8
def filter_wiki(raw):
    """
    Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
    or utf-8 encoded string.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.decode_htmlentities(utils.to_unicode(raw, 'utf8', errors='ignore'))
    text = utils.decode_htmlentities(text) # '&amp;nbsp;' --> '\xa0'
    text = re.sub(RE_P2, "", text) # remove the last list (=languages)
    # the wiki markup is recursive (markup inside markup etc)
    # instead of writing a recursive grammar, here we deal with that by removing
    # markup in a loop, starting with inner-most expressions and working outwards,
    # for as long as something changes.
    iters = 0
    while True:
        old, iters = text, iters + 1
        text = re.sub(RE_P0, "", text) # remove comments
        text = re.sub(RE_P1, '', text) # remove footnotes
        text = re.sub(RE_P9, "", text) # remove outside links
        text = re.sub(RE_P10, "", text) # remove math content
        text = re.sub(RE_P11, "", text) # remove all remaining tags
        # remove templates (no recursion)
        text = re.sub(RE_P3, '', text)
        text = re.sub(RE_P4, '', text)
        text = re.sub(RE_P5, '\\3', text) # remove urls, keep description
        text = re.sub(RE_P7, '\n\\3', text) # simplify images, keep description only
        text = re.sub(RE_P8, '\n\\3', text) # simplify files, keep description only
        text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only
        # remove table markup
        text = text.replace('||', '\n|') # each table cell on a separate line
        text = re.sub(RE_P12, '\n', text) # remove formatting lines
        text = re.sub(RE_P13, '\n\\3', text) # leave only cell content
        # remove empty mark-up
        text = text.replace('[]', '')
        if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations
            break

    # the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
    # TODO is this really desirable?
    text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text
    return text
Example #9
def iterate_wiki(input_path):
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid) for title, text, pageid in
             extract_pages(bz2.BZ2File(input_path), filter_namespaces))
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        yield title, tokens
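A hypothetical driver loop for the generator above; the dump file name is illustrative. iterate_wiki yields one (title, tokens) pair for each article that passes the length and namespace filters:

for title, tokens in iterate_wiki('enwiki-latest-pages-articles.xml.bz2'):
    print(title, len(tokens))
    # e.g. feed tokens into a dictionary / corpus builder here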
Example #10
def filter_wiki(raw, promote_remaining=True, simplify_links=True):
    """Filter out wiki markup from `raw`, leaving only text.

    Parameters
    ----------
    raw : str
        Unicode or utf-8 encoded string.
    promote_remaining : bool
        Whether uncaught markup should be promoted to plain text.
    simplify_links : bool
        Whether links should be simplified keeping only their description text.

    Returns
    -------
    str
        `raw` without markup.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return remove_markup(text, promote_remaining, simplify_links)
Example #11
def preprocessing(revisions):
    print "preprocessing called..."
    corpus = []
    stop_words = set(stopwords.words('english'))
    punctuation = re.compile(r'[-.?!/,":;*()=%$\'/\\&_\[\]}{<>#^\-+@|0-9]')
    pattern = re.compile(r'\s\s+')
    punctuation = re.compile(r'[^a-zA-Z]+')
    third_layer = re.compile(r'(^| ).( |$)')
    # # start_time = time.time()
    count = 0
    for each_revision in revisions:
        try:
            count = count + 1
            text = utils.decode_htmlentities(
                utils.to_unicode(each_revision, 'utf8', errors='ignore'))
            # text = utils.decode_htmlentities(text)
            punc_text = punctuation.sub(" ", remove_markup(text))
            text = re.sub(pattern, ' ',
                          punc_text)  #to remove extra white spaces.
            third_text = re.sub(
                third_layer, ' ',
                text)  #to remove all other characters except for text
            tokenize = nltk.word_tokenize(third_text)
            lowered_text = [w.lower().strip() for w in tokenize]
            text_without_stopwords = [
                w for w in lowered_text if not w in stop_words
            ]
            # # print text_without_stopwords
            corpus.append(text_without_stopwords)
            if count % 100 == 0:
                print "Preprocessed {} revision".format(count)
        except:
            sys.exc_info()[0]
            next

    print len(corpus)
    return corpus
Example #12
def get_raw_text_and_links_from_markup(raw):
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return __remove_markup(text)
Example #13
def filter_wiki(raw):
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return remove_markup(text)#, promote_remaining=False, simplify_links=True)
Example #14
def process_enwikinews(s,verbose=True):
    # extract text (vs pageid and title)
    ss = RE_P17.split(s)
    if len(ss)!=3 or not ss[2]:        
        if verbose:
            print('There is no text in this article.')
        return None
    # [:-1] means remove the last '\n' in RE_P17
    pageid, _, title = ss[1][:-1].partition('\n')
    if any(title.lower().startswith(x) for x in IGNORED_TITLES):
        if verbose:
            print('This article does not have a normal title.')
        return None
    text = ss[2].strip()
    if any(text.lower().startswith(x) for x in IGNORED_TEXTS):
        if verbose:
            print('This article does not have a normal text.')
        return None
    # extract main content of text (i.e. remove its tail)
    text = text.partition('{{haveyoursay}}')    
    if not text[1]:
        text = text[0].partition('==')
    text = text[0].strip()
    
    # from filter_wiki()
    text = utils.to_unicode(text,'utf8',errors='ignore')
    text = utils.decode_htmlentities(text)
    
    # from remove_markup()
    text = RE_P2.sub('',text)  # remove the last list (=languages)
    ## template-related (for future: ...=...)
    ### {{Brazil}}
    text = RE_template_1.sub(r'\1\n',text)
    ### {{date|November 13, 2004}}
    text = RE_template_2.sub(_repl_or,text)
    ## file[/image]-related
    text = RE_P15.sub('',text)
    ## the rest
    iters = 0
    while True:
        old, iters = text, iters + 1
        text = RE_P0.sub('',text)  # remove comments (pageid = 1471698)
        text = RE_P1.sub('',text)  # remove footnotes
        text = RE_P9.sub('',text)  # remove outside links
        text = RE_P10.sub('MATH', text)  # remove math content
        text = RE_P11.sub('', text)  # remove all remaining tags
        text = RE_P14_edited.sub('', text)  # remove categories
        text = RE_P5.sub(r'\3', text)  # remove urls, keep description
        text = RE_P6.sub(r'\2', text)  # simplify links, keep description only
        # remove table markup
        text = text.replace('||', '\n|')  # each table cell on a separate line
        text = RE_P12.sub('\n', text)  # remove formatting lines
        text = RE_P13.sub(r'\n\3', text)  # leave only cell content
        # remove empty mark-up
        text = text.replace('[]', '')
        # stop if nothing changed between two iterations or after a fixed number of iterations
        if old == text or iters > 2:
            break
    text = text.replace('[', '').replace(']', '')  # promote all remaining markup to plain text
    # {{byline|date=November 14, 2004|location=RAMALLAH}}
    text = RE_template_3.sub(_repl_byline,text)
    # cleaning
    text = remove_template(text) # pageid = 113289
    text = re.sub('(\n)+','\n',text).strip()
    if not text:
        if verbose:
            print('This article does not have a normal text.')
        return None
    return pageid, title, text
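The tail-stripping step above relies on str.partition returning a 3-tuple whose middle element is empty when the separator is not found, which is what makes the {{haveyoursay}} / '==' fallback work. A tiny illustration:

text = "Body of the article.\n== Sources ==\nsome links"
parts = text.partition('{{haveyoursay}}')
if not parts[1]:                      # separator absent, middle element is ''
    parts = parts[0].partition('==')
print(parts[0].strip())               # 'Body of the article.'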
Example #15
 def test_decode_entities(self):
     # create a string that fails to decode with unichr on narrow python builds
     body = u'It&#146;s the Year of the Horse. YES VIN DIESEL &#128588; &#128175;'
     expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af'
     self.assertEqual(utils.decode_htmlentities(body), expected)
Example #16
def config_argparser():
    argparser = argparse.ArgumentParser(description='Wikipedia Dump Extractor')
    argparser.add_argument('-input_path', type=str, required=True, help='Path to the raw Wikipedia dump')
    argparser.add_argument('-output_path', type=str, required=True, help='Write path for extracted text content')
    return argparser.parse_args()


if __name__ == '__main__':
    arguments = config_argparser()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid) for title, text, pageid in
             extract_pages(bz2.BZ2File(arguments.input_path), filter_namespaces))
    parsed_article_counter = 0
    space = u' '
    output = codecs.open(arguments.output_path, 'w', 'utf-8')
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        output.write("{}\n".format(space.join(tokens) + "\n"))
        parsed_article_counter += 1
    print('Parsed articles: {}'.format(parsed_article_counter))
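A sketch of how the script above could be driven; the file names are hypothetical and config_argparser is the parser defined at the top of this example:

import sys

# Simulate a command-line invocation (paths are made up).
sys.argv = ['extract_wiki.py',
            '-input_path', 'enwiki-latest-pages-articles.xml.bz2',
            '-output_path', 'wiki_tokens.txt']
arguments = config_argparser()
print(arguments.input_path, arguments.output_path)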
Example #17
 def test_decode_entities(self):
     # create a string that fails to decode with unichr on narrow python builds
     body = u'It&#146;s the Year of the Horse. YES VIN DIESEL &#128588; &#128175;'
     expected = u'It\x92s the Year of the Horse. YES VIN DIESEL \U0001f64c \U0001f4af'
     self.assertEquals(utils.decode_htmlentities(body), expected)
Example #18
 def __decode_htmlentities(self, text):
     return utils.decode_htmlentities(text)
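For reference, a minimal sketch of what decode_htmlentities does with named and numeric entities, assuming gensim's utils module as in the snippets above; the expected values follow the unit test in Example #17:

from gensim import utils

print(utils.decode_htmlentities('AT&amp;T'))    # 'AT&T'
print(utils.decode_htmlentities('&#146;'))      # '\x92' (numeric entity, as in the test above)
print(utils.decode_htmlentities('&#128175;'))   # '\U0001f4af'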