Example #1
import re
import wikitextparser as wtp  # provides wtp.parse(...).plain_text()

# remove_double_curly, remove_double_brackets, and htt are helpers
# defined elsewhere in this example's source file
def dewiki(text):
    text = remove_double_curly(text)     # strip {{...}} template markup
    text = remove_double_brackets(text)  # strip [[...]] link markup
    text = wtp.parse(text).plain_text()  # wiki markup to plain text
    text = htt(text)                     # strip leftover HTML
    text = text.replace('\\n', ' ')      # replace escaped newlines
    text = re.sub(r'\[\[', ' ', text)    # remove any remnant opening brackets
    text = re.sub(r'\]\]', ' ', text)    # remove any remnant closing brackets
    text = re.sub(r'\s+', ' ', text)     # collapse excess whitespace
    return text
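remove_double_curly and remove_double_brackets are not shown in this example. Judging only by their names, they strip {{...}} template markup and unwrap [[...]] links; the sketches below are hypothetical stand-ins, not the original implementations:

import re

def remove_double_curly(text):
    # repeatedly strip innermost {{...}} templates so nesting unwinds
    while re.search(r'\{\{[^{}]*\}\}', text):
        text = re.sub(r'\{\{[^{}]*\}\}', ' ', text)
    return text

def remove_double_brackets(text):
    # keep the display text of [[target|display]] links, drop the markup
    return re.sub(r'\[\[(?:[^|\]]*\|)?([^\]]*)\]\]', r'\1', text)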
Example #2
def analyze_chunk(text):
    # text is one <page>...</page> chunk of a Wikipedia XML dump;
    # htt and dewiki are defined alongside this function
    try:
        if '<redirect title="' in text:  # this is not the main article
            return None
        else:
            title = text.split('<title>')[1].split('</title>')[0]
            if ':' in title:  # this is a talk, category, or other (not a real article)
                return None
            title = htt(title).strip()
        serial = text.split('<id>')[1].split('</id>')[0]
        content = text.split('</text')[0].split('<text')[1].split(
            '>', maxsplit=1)[1]
        content = dewiki(content)
        return {'title': title, 'text': content, 'id': serial}
    except Exception:  # malformed chunk; skip it
        return None
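To make the expected input concrete, here is analyze_chunk fed a simplified <page> chunk. The sample XML is invented for illustration and far sparser than a real Wikipedia dump, and it assumes the companion dewiki and htt helpers are in scope:

sample = """<page>
  <title>Alan Turing</title>
  <id>42</id>
  <text>'''Alan Turing''' was a mathematician.</text>
</page>"""
print(analyze_chunk(sample))
# roughly: {'title': 'Alan Turing', 'text': 'Alan Turing was a mathematician.', 'id': '42'}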
Example #3
def dewiki(text):
    # the remove_* helpers and htt are defined elsewhere in this file
    text = remove_simple_links(text)
    text = remove_pictures(text)
    text = remove_audio(text)
    text = remove_compound_links(text)
    text = remove_references(text)
    text = remove_citations(text)
    text = remove_categories(text)
    text = remove_all_links(text)
    text = remove_urls(text)
    text = remove_wikitables(text)  # TODO preserve this data somehow

    text = wtp.parse(text).plain_text()  # wiki to plaintext whatever is left
    text = htt(text)  # de-HTML text

    # text = re.sub(r'\]\]', ' ', text)  # remove any remnant brackets
    text = text.replace('\\n', ' ')  # replace escaped newlines
    text = re.sub(r'\s+', ' ', text)  # collapse excess whitespace
    return text
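The remove_wikitables helper is also defined outside this snippet. Assuming it simply drops {| ... |} table blocks, which is consistent with the TODO about preserving the table data, a hypothetical version might look like:

import re

def remove_wikitables(text):
    # drop {| ... |} table markup wholesale; this loses the table data the
    # TODO wants to preserve, and it does not handle nested tables
    return re.sub(r'\{\|.*?\|\}', ' ', text, flags=re.DOTALL)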
Example #4
def dewiki(text):
    text = text.replace('\\n', ' ')  # replace escaped newlines
    text = re.sub(r'\s+', ' ', text)  # collapse excess whitespace
    text = remove_audio(text)
    text = remove_references(text)
    text = remove_citations(text)
    text = remove_categories(text)
    text = remove_simple_links(text)
    text = remove_compound_links(text)
    text = remove_pictures(text)
    text = remove_all_links(text)
    text = remove_urls(text)
    # TODO handle class="sortable wikitable" and class="wikitable"

    text = wtp.parse(text).plain_text()  # wiki to plaintext whatever is left
    text = htt(text)  # de-HTML text

    text = re.sub(r'\]\]', ' ', text)  # remove any remnant brackets
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace once more
    return text
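Two more of the unshown helpers are guessable from their names alone; again, these sketches are hypothetical stand-ins, not the original code:

import re

def remove_categories(text):
    # drop [[Category:...]] tags
    return re.sub(r'\[\[Category:[^\]]*\]\]', ' ', text)

def remove_urls(text):
    # drop bare http(s):// URLs
    return re.sub(r'https?://\S+', ' ', text)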
Example #5
def analyze_chunk(text):
    try:
        if '<redirect title="' in text:  # this is not the main article
            return None
        if '(disambiguation)' in text:  # this is not an article
            return None
        else:
            title = text.split('<title>')[1].split('</title>')[0]
            title = htt(title)
            if ':' in title:  # titles containing ':' are usually talk/category/etc., not real articles
                return None
        serial = text.split('<id>')[1].split('</id>')[0]
        content = text.split('</text')[0].split('<text')[1].split(
            '>', maxsplit=1)[1]
        content = dewiki(content)
        return {
            'title': title.strip(),
            'text': content.strip(),
            'id': serial.strip()
        }
    except Exception as oops:
        print(oops)
        return None
Example #6
def dewiki(text):
    text = wtp.parse(text).plain_text()  # wiki to plaintext
    text = htt(text)  # remove any HTML
    text = text.replace('\\n', ' ')  # replace escaped newlines
    text = re.sub(r'\s+', ' ', text)  # collapse excess whitespace
    return text
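Every example above leans on htt, a project-local de-HTML helper whose definition is not shown. A minimal standard-library stand-in, assuming htt just strips tags and keeps the text nodes (hypothetical, not the original):

from html.parser import HTMLParser

class _HTMLToText(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []

    def handle_data(self, data):
        # collect only the text nodes, skipping tags and comments
        self.parts.append(data)

def htt(raw):
    f = _HTMLToText()
    f.feed(raw)
    return ''.join(f.parts)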