def main():
    """Print the name part of every file reference found in the loaded text.

    A reference is the text that follows a ``File:`` marker (or its Japanese
    equivalent in the pattern below) up to the next ``|`` character. Each
    captured name is UTF-8 encoded and printed on its own line.

    Returns:
        int: always 0.
    """
    text = load_data()
    # NOTE(review): ``[^|]+`` also matches newlines and ``]``, so a reference
    # written without a ``|`` would run on to the next pipe -- confirm the
    # input never contains such references.
    file_ref = re.compile(r'[\[\[]?(?:ファイル|File):(?P<name>[^|]+)\|')
    for hit in file_ref.finditer(text):
        print(hit.group('name').encode('utf8'))
    return 0
def main():
    """Print every key/value pair returned by ``extract_basic_info``.

    The loaded text is passed to ``extract_basic_info``; for each item of the
    resulting mapping, a ``key = ...`` line and a ``value = ...`` line
    (followed by a blank line) are printed, both UTF-8 encoded.

    Returns:
        int: always 0.
    """
    fields = extract_basic_info(load_data())
    for name, text in fields.items():
        print('key = {}'.format(name.encode('utf8')))
        print('value = {}\n'.format(text.encode('utf8')))
    return 0
def main():
    """Print the name of every ``[[Category:...]]`` link in the loaded text.

    Only the category name is printed: an optional ``|...`` suffix inside the
    link is matched by the pattern but left out of the ``name`` group. Each
    name is UTF-8 encoded and printed on its own line.

    Returns:
        int: always 0.
    """
    text = load_data()
    category_link = re.compile(r'\[\[Category:(?P<name>.+?)(\|.*)?\]\]')
    names = (hit.group('name') for hit in category_link.finditer(text))
    for name in names:
        print(name.encode('utf8'))
    return 0
def main():
    """Print every ``[[Category:...]]`` span found in the loaded text.

    The whole matched span (brackets included) is UTF-8 encoded and printed,
    one match per line. The ``.+`` in the pattern is greedy, so several links
    on the same line are reported as a single span.

    Returns:
        int: always 0.
    """
    text = load_data()
    category_span = re.compile(r'\[\[Category:.+\]\]')
    for hit in category_span.finditer(text):
        whole = hit.group()
        print(whole.encode('utf8'))
    return 0
def main():
    """Print the ``extract_basic_info`` pairs with cleaned-up values.

    For each item of the mapping returned by ``extract_basic_info``, the key
    is printed as-is; the value is first passed through ``remove_emphasis``
    and then through ``remove_internal_link`` before being printed. Both
    lines are UTF-8 encoded.

    Returns:
        int: always 0.
    """
    fields = extract_basic_info(load_data())
    for name, raw in fields.items():
        # The key line is emitted before the value is cleaned.
        print('key = {}'.format(name.encode('utf8')))
        cleaned = remove_internal_link(remove_emphasis(raw))
        print('value = {}\n'.format(cleaned.encode('utf8')))
    return 0
def main():
    """Print ``title,level`` for every ``==``-delimited heading in the text.

    A heading is a run of two or more ``=`` characters, a title, and the same
    run of ``=`` again. The printed level is the length of that run minus
    one; the title is stripped of surrounding whitespace and UTF-8 encoded.

    Returns:
        int: always 0.
    """
    text = load_data()
    heading = re.compile(r'(?P<sep>==+)(?P<title>.+?)(?P=sep)')
    for hit in heading.finditer(text):
        title = hit.group('title').strip().encode('utf8')
        level = len(hit.group('sep')) - 1
        print('{},{}'.format(title, level))
    return 0
def main():
    """List the headings of the loaded text as ``title,level`` lines.

    The pattern looks for a title enclosed by two identical runs of at least
    two ``=`` characters. For each match, the whitespace-stripped, UTF-8
    encoded title is printed together with the run length minus one.

    Returns:
        int: always 0.
    """
    source = load_data()
    for found in re.finditer(r'(?P<sep>==+)(?P<title>.+?)(?P=sep)', source):
        marker, caption = found.group('sep', 'title')
        line = '{},{}'.format(caption.strip().encode('utf8'), len(marker) - 1)
        print(line)
    return 0
def main():
    """Print the ``extract_basic_info`` pairs with emphasis markup removed.

    For each item of the mapping returned by ``extract_basic_info``, the key
    is printed unchanged and the value is printed after being passed through
    ``remove_emphasis``. Both lines are UTF-8 encoded.

    Returns:
        int: always 0.
    """
    fields = extract_basic_info(load_data())

    # Manual spot checks for remove_emphasis, left disabled:
    #   print(remove_emphasis("'''''aiueo'''''"))
    #   print(remove_emphasis("''''aiueo''''"))
    #   print(remove_emphasis("'''aiueo'''"))
    #   print(remove_emphasis("''aiueo''"))
    #   print(remove_emphasis("'aiueo'"))

    for name, raw in fields.items():
        print('key = {}'.format(name.encode('utf8')))
        plain = remove_emphasis(raw)
        print('value = {}\n'.format(plain.encode('utf8')))
    return 0
def main():
    """Query the Japanese Wikipedia API for an image URL and print it.

    The image title is taken from the ``u'国旗画像'`` (flag image) entry of
    the mapping returned by ``extract_basic_info``. An ``imageinfo`` query
    for ``Image:<title>`` is sent to ``ja.wikipedia.org`` and the ``url`` of
    the first ``imageinfo`` record in the JSON response is printed.

    Returns:
        int: always 0.

    Raises:
        KeyError: if the ``u'国旗画像'`` entry or any expected key of the
            JSON response is missing.
        IOError: presumably raised by ``urllib.urlopen`` on network failure
            (Python 2 behaviour -- confirm for the interpreter in use).
    """
    info = extract_basic_info(load_data())
    params = {
        'action': 'query',
        'format': 'json',
        'iiprop': 'url',
        'prop': 'imageinfo',
        'titles': 'Image:{}'.format(info[u'国旗画像']),
    }
    url = u'http://ja.wikipedia.org/w/api.php?' + urllib.urlencode(params)

    # Open the connection *before* the try block: if urlopen() itself fails
    # there is nothing to close, and a ``finally`` that referenced the
    # response would raise UnboundLocalError and hide the real error.
    response = urllib.urlopen(url)
    try:
        payload = json.loads(response.read().decode('utf8'))
    finally:
        response.close()

    # NOTE(review): the page is looked up under the fixed id u'-1'; confirm
    # the API always reports the queried image under that id.
    print(payload[u'query'][u'pages'][u'-1'][u'imageinfo'][0][u'url'])
    return 0