def main():
    """Print each line of the wiki documents that holds a category tag.

    A line is printed when it contains ``[[Category:`` followed by one or
    more non-whitespace characters and a closing ``]]``.
    """
    articles = extract_wikidocs()
    category_tag = re.compile(r'\[\[Category:\S+\]\]')
    for article in articles:
        for text_line in article['text'].split('\n'):
            if category_tag.search(text_line) is None:
                continue
            print(text_line)
def main():
    """Print each line of the wiki documents that contains a category link.

    A line qualifies when ``[[Category:`` is followed, later on the same
    line, by ``]]``.
    """
    # A plain raw string replaces the Python 2-only ``ur'...'`` literal,
    # which is a syntax error on Python 3.
    category_link = re.compile(r'\[\[Category:.*?\]\]')
    for doc in extract_wikidocs():
        for line in doc['text'].split('\n'):
            if category_link.search(line):
                # print() encodes text for the output stream itself, so the
                # manual UTF-8 encoding of the Python 2 version is dropped.
                # NOTE(review): assumes doc['text'] is str - confirm against
                # extract_wikidocs.
                print(line)
# Example #3
# 0
def main():
    """Print the name of every media file referenced in the wiki documents.

    A reference is the text following ``File:`` or ``ファイル:``.
    """
    docs = extract_wikidocs()
    # The name ends at the first '|', ']' or newline.  Stopping only at '|'
    # made a reference with no '|' after it (e.g. ``[[File:x.jpg]]``) swallow
    # the closing brackets and all following text up to the next '|'.
    pattern = re.compile(r'(?:File|ファイル):([^|\]\n]+)')
    for doc in docs:
        # Find all markups File: or ファイル:
        for file_name in pattern.findall(doc['text']):
            print(file_name)
def main():
    """Print the name of every category linked from the wiki documents.

    For each ``[[Category:name]]`` or ``[[Category:name|sortkey]]`` link
    only ``name`` is printed, one per line.
    """
    # Non-greedy so that two links on the same line are matched separately
    # rather than merged into a single capture.
    category_link = re.compile(r'\[\[Category:(.+?)\]\]')
    for doc in extract_wikidocs():
        for line in doc['text'].split('\n'):
            for target in category_link.findall(line):
                # Text after the first '|' is the link's sort key, not part
                # of the category name.  Cutting there replaces the earlier
                # "no '*' or space" filter, which printed ordinary sort keys
                # as if they were categories and dropped names that contain
                # a space.
                print(target.split('|', 1)[0])
def main():
    """Collect the target of every category link, then print one per line.

    The captured text is the run of non-whitespace characters between
    ``[[Category:`` and the closing ``]]``; it therefore still includes any
    ``|...`` suffix present inside the link.
    """
    docs = extract_wikidocs()
    # Raw string: '\[' and '\S' are invalid escape sequences in a normal
    # string literal and trigger a SyntaxWarning on recent Python versions.
    category_link = re.compile(r'\[\[Category:(\S+)\]\]')
    categories = []
    for doc in docs:
        for line in doc['text'].split('\n'):
            categories.extend(category_link.findall(line))

    for cat in categories:
        print(cat)
# Example #6
# 0
def parse_folder():
    """Parse every ``{{基礎情報 ...}}`` block found in the wiki documents.

    A block starts at ``{{基礎情報`` and runs, possibly across several lines,
    up to the first ``}}`` that sits at the start of a line.

    Returns:
        A list with the result of ``parse_infobox`` for each block, in the
        order the blocks are encountered.
    """
    articles = extract_wikidocs()
    # MULTILINE lets '^' match at each line start; DOTALL lets '.' span lines.
    infobox_re = re.compile('{{基礎情報.+?^}}', re.MULTILINE | re.DOTALL)
    parsed = []
    for article in articles:
        blocks = infobox_re.findall(article['text'])
        parsed.extend(parse_infobox(block) for block in blocks)
    return parsed
def get_infobox():
    """Parse every ``{{基礎情報 ...}}`` block found in the wiki documents.

    A block starts at ``{{基礎情報`` and runs, possibly across several lines,
    up to the first ``}}`` at the start of a line that is immediately
    followed by a newline; that newline is part of the matched text.

    Returns:
        A list with the result of ``parse_infobox`` for each block, in the
        order the blocks are encountered.
    """
    docs = extract_wikidocs()
    # A plain raw string replaces the Python 2-only ``ur'...'`` literal,
    # which is a syntax error on Python 3.
    pattern = re.compile(r'{{基礎情報.+?^}}\n', re.M | re.DOTALL)
    objs_list = []
    for doc in docs:
        objs_list.extend(parse_infobox(m) for m in pattern.findall(doc['text']))

    return objs_list
def main():
    """Print every section heading in the wiki documents with its level.

    A heading is matched as two or more ``=``, a space, the title, a space
    and two or more ``=``; headings written without those spaces are not
    matched.  The level is the number of leading ``=`` minus one, so
    ``== Title ==`` is level 1.

    If the opening and closing markers of a heading differ in length, the
    heading is printed and the program exits with status 1.
    """
    import sys

    wiki_docs = extract_wikidocs()

    pattern = re.compile(r'(={2,}) ([^=]+) (={2,})')
    for doc in wiki_docs:
        for pfx, sec, sfx in pattern.findall(doc['text']):
            if len(pfx) != len(sfx):
                print('%s %s %s' % (pfx, sec, sfx))
                # The original had a bare ``exit`` here, an expression that
                # does nothing, so malformed headings were never stopped on.
                sys.exit(1)

            orig = pfx + ' ' + sec + ' ' + sfx
            level = len(pfx) - 1

            print('%-40s Level %s\t\t%s' % (sec, level, orig))