def deal_with_all_files(root_path):
    """Attach audio files under *root_path* to the songs registered in ``table``.

    Only directories exactly one level below *root_path* are inspected. The
    directory name selects a disc type: either ``'Type-'`` plus the character
    found five positions after the start of ``'Type'`` in the name, or the
    first entry of ``meta_parser.get_other_disc_type()`` contained in the
    name. Directories whose type is not a key of ``table`` are skipped.

    For every audio file, each ``((keyname, keyoff), song)`` entry of
    ``table[type_str]`` is checked; when the normalized file name contains
    ``keyname`` and the off-vocal flags agree, the file is assigned to the
    song unless the song already has a shorter-or-equal file name.

    Args:
        root_path: Directory to walk. It is stripped from each visited path
            by length, so it is expected not to end with a path separator.
    """
    import os

    audio_suffixes = ('.wav', '.mp3', '.flac', '.ape', '.m4a')
    # 'off cocal' is kept on purpose: it is part of the original word list.
    off_vocal_words = ('off vocal', 'off cocal', 'off-vocal', 'instrument',
                       'music')

    for root, _dirs, files in os.walk(root_path):
        if not files:
            continue
        rel_path_str = root[(len(root_path) + 1):]
        # Skip the root itself and anything nested deeper than one level.
        # NOTE(review): the separator is hard-coded to a backslash.
        if not rel_path_str or '\\' in rel_path_str:
            continue

        if 'Type' in rel_path_str:
            letter_pos = rel_path_str.find('Type') + 5
            # A name such as "Type" or "TypeA" has no character at that
            # position; skip it instead of raising IndexError.
            if letter_pos >= len(rel_path_str):
                continue
            type_str = 'Type-' + rel_path_str[letter_pos]
        else:
            type_str = next(
                (disc_cd for disc_cd in meta_parser.get_other_disc_type()
                 if disc_cd in rel_path_str), None)
            if type_str is None:
                continue

        if type_str not in table:
            continue
        songs = table[type_str]

        for file in files:
            fn = meta_parser.normalize(file)
            if not fn.endswith(audio_suffixes):
                continue
            lowered = file.lower()
            is_off_vocal = any(word in lowered for word in off_vocal_words)
            for (keyname, keyoff), song in songs.items():
                if fn.find(keyname) >= 0 and keyoff == is_off_vocal:
                    # Prefer the shortest matching file name for a song.
                    if not song.pure_filename or len(
                            song.pure_filename) > len(file):
                        song.set_file(rel_path_str, file)
def handle_with_line(thetype, line):
    """Parse one track-list line and register the song in ``table``.

    The line must look like ``<track_id>. ... 「<title>」(<performer>) ...
    <m:ss>``; footnote markers of the form ``[注 N]`` are removed first.
    Lines that do not match are ignored. A title containing "off vocal"
    (any letter case) marks the song as off-vocal and has its trailing
    "off vocal ver." decoration stripped.

    The song is stored as ``table[thetype][(normalized_title, is_off_vocal)]``
    unless that key already exists.

    Args:
        thetype: Disc type the line belongs to; ``None`` makes the call a
            no-op.
        line: Track-list line to parse.
    """
    if thetype is None:
        return

    line = re.sub(r'\[注 \d+\]', '', line)
    match = re.match(
        r'(?P<track_id>\d+)\..*\s+「(?P<title>.*)」(\((?P<performer>[\w\s]+)\))?.*(?P<length>\d+:\d+)',
        line)
    if not match:
        return

    track_id = match.group('track_id')
    title = match.group('title')
    performer = match.group('performer')
    length = match.group('length')

    is_off_vocal = 'off vocal' in title.lower()
    if is_off_vocal:
        # Detection above is case-insensitive, so stripping must be as well;
        # otherwise "Off Vocal Ver." is flagged but left in the title.
        title = re.sub(r'\s*[\-((]?off vocal ver(\.)?[))\-]?\s*$', '', title,
                       flags=re.IGNORECASE)

    song = Song(track_id, title, performer, is_off_vocal, length)
    song.thetype = thetype
    pair = (meta_parser.normalize(title), is_off_vocal)
    # The first song registered under a key wins.
    table.setdefault(thetype, {}).setdefault(pair, song)
def load_wikipedia(filename):
    """Parse a saved text dump of a single's page and return its metadata.

    The file is read as UTF-8 and processed line by line (blank lines are
    skipped) in three modes:

    * ``meta_mode`` (initial): the first line is handed to
      ``meta_parser.parse_album``; lines starting with 'リリース' provide the
      date and lines ending with 'の シングル' provide the performer.
    * ``track_mode`` (after a line starting with 'シングル収録トラック'):
      'Type...' lines and entries of ``meta_parser.get_other_disc_type()``
      select the current disc type, 'CD' / 'DVD' lines switch track reading
      on / off, and remaining lines are passed to ``handle_with_line`` while
      reading is on.
    * ``unit_mode`` (after a line starting with '選抜メンバー'): a line whose
      normalized text is a key ``(text, False)`` of ``table`` selects a song;
      a following line may fill in that song's missing performer.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict: May contain the keys ``'title'``, ``'date'`` and
        ``'performer'``.
    """
    meta = {}
    with open(filename, encoding='utf-8') as f:
        content = [raw.strip() for raw in f]

    will_read = False
    thetype = None
    first_line = True
    mode = 'meta_mode'
    patch_song = None

    for line in content:
        if not line:
            continue
        # Section headers switch the parsing mode.
        if line.startswith('シングル収録トラック'):
            mode = 'track_mode'
            will_read = False
            continue
        if line.startswith('選抜メンバー'):
            mode = 'unit_mode'
            will_read = False
            continue

        if mode == 'meta_mode':
            if first_line:
                # parse_album also returns the new value of the flag.
                first_line, album = meta_parser.parse_album(line)
                meta['title'] = album
            if line.startswith('リリース'):
                meta['date'] = meta_parser.parse_release_date(line)
            elif line.endswith('の シングル'):
                meta['performer'] = meta_parser.parse_performer(
                    line, 'シングル')
        elif mode == 'track_mode':
            if line.startswith(
                    'Type') or line in meta_parser.get_other_disc_type():
                thetype = line
            elif line.startswith('CD'):
                will_read = True
            elif line.startswith('DVD'):
                will_read = False
            elif will_read:
                handle_with_line(thetype, meta_parser.purge_line(line))
        elif mode == 'unit_mode':
            # Drop bracketed annotations such as footnote markers.
            line = re.sub(r'\[[\s\w]+\]', '', line)
            nline = meta_parser.normalize(line)
            # NOTE(review): this looks ``table`` up by (title, off_vocal)
            # directly, not per disc type - confirm against the table layout
            # used by the track parser.
            if (nline, False) in table:
                will_read = True
                patch_song = table[(nline, False)]
            elif will_read and patch_song and not patch_song.performer:
                match = re.match(r'.*ユニット:([^),、]+).*', line)
                will_read = False
                if match:
                    patch_song.performer = match.group(1)
                elif line.find('、') < 0 and line.find(':') < 0:
                    # A plain line without separators is taken as the name.
                    patch_song.performer = line
                else:
                    # Not a performer line; keep looking on the next line.
                    will_read = True
    return meta
def deal_with_all_files(root_path):
    """Attach audio files under *root_path* to the songs registered in ``table``.

    Two directory layouts (relative to *root_path*, backslash separated) are
    recognised:

    1. ``<type>\\<disc...>``, e.g. ``Type A\\Disc 2``: the first component
       selects the disc type. A name starting with 'Type' becomes
       ``'Type-'`` plus everything after its first five characters; otherwise
       the name must be an entry of ``meta_parser.get_other_disc_type()``.
    2. ``<disc...>``, e.g. ``DISC 2 (Type-A)``: the directory name itself is
       used as the disc type.

    Directories whose disc type is not a key of ``table`` are skipped. For
    every audio file, each ``(keyname, song)`` entry of ``table[type_str]``
    whose ``keyname`` occurs in the normalized file name receives the file,
    unless the song already has a shorter-or-equal file name.

    Args:
        root_path: Directory to walk. It is stripped from each visited path
            by length, so it is expected not to end with a path separator.
    """
    import os

    audio_suffixes = ('.wav', '.mp3', '.flac', '.ape', '.m4a')

    for root, _dirs, files in os.walk(root_path):
        if not files:
            continue
        rel_path_str = root[(len(root_path) + 1):]
        # NOTE(review): the separator is hard-coded to a backslash.
        rel_path = rel_path_str.split('\\')

        if len(rel_path) == 2 and rel_path[1].lower().startswith('disc'):
            # Layout 1: Type A\Disc 2
            rel_type_str = rel_path[0]
            if rel_type_str.startswith('Type'):
                type_str = 'Type-' + rel_type_str[len('Type-'):]
            elif rel_type_str in meta_parser.get_other_disc_type():
                type_str = rel_type_str
            else:
                continue
        elif len(rel_path) == 1 and rel_path[0].lower().startswith('disc'):
            # Layout 2: DISC 2 (Type-A); the whole name is the lookup key.
            type_str = rel_path[0]
        else:
            continue

        # Guard both layouts: an unknown type must be skipped, not raise
        # KeyError on the lookup below.
        if type_str not in table:
            continue
        songs = table[type_str]

        for file in files:
            fn = meta_parser.normalize(file)
            if not fn.endswith(audio_suffixes):
                continue
            for keyname, song in songs.items():
                if fn.find(keyname) >= 0:
                    # Prefer the shortest matching file name for a song.
                    if not song.pure_filename or len(
                            song.pure_filename) > len(file):
                        song.set_file(rel_path_str, file)
def deal_with_all_files(root_path):
    """Attach every audio file under *root_path* to matching songs in ``table``.

    All directories are walked. For each file with an audio extension, every
    ``((keyname, keyoff), song)`` entry of ``table`` is checked: when the
    normalized file name contains ``keyname`` and ``keyoff`` equals the
    file's off-vocal flag, the file is assigned to the song unless the song
    already has a shorter-or-equal file name.

    Args:
        root_path: Directory to walk. It is stripped from each visited path
            by length to obtain the relative directory passed to
            ``song.set_file``.
    """
    import os

    audio_suffixes = ('.wav', '.mp3', '.flac', '.ape', '.m4a')
    off_vocal_markers = ('off vocal', 'off-vocal', 'instrument', 'music',
                         'instrumental')

    for current_dir, _subdirs, filenames in os.walk(root_path):
        relative_dir = current_dir[len(root_path) + 1:]
        for filename in filenames:
            normalized = meta_parser.normalize(filename)
            if not normalized.endswith(audio_suffixes):
                continue

            # The off-vocal flag is derived from the raw (lower-cased) name.
            lowered = filename.lower()
            off_vocal = False
            for marker in off_vocal_markers:
                if marker in lowered:
                    off_vocal = True
                    break

            for (keyname, keyoff), song in table.items():
                if normalized.find(keyname) < 0 or keyoff != off_vocal:
                    continue
                current = song.pure_filename
                # Keep the shortest matching file name per song.
                if not current or len(current) > len(filename):
                    song.set_file(relative_dir, filename)
def handle_with_line_others(thetype, track_id, line):
    """Parse a ``title - performer [m:ss]`` line and register the song.

    The performer and the bracketed length are both optional. Empty lines,
    lyricist credit lines (starting with '作詞' followed by an ASCII or
    full-width colon) and calls without a disc type are rejected.

    The song is stored as ``table[thetype][normalized_title]`` unless that
    key already exists; it is always created as not off-vocal.

    Args:
        thetype: Disc type the line belongs to; ``None`` rejects the line.
        track_id: Track identifier passed through to ``Song``.
        line: Text line to parse.

    Returns:
        bool: ``True`` when the line was parsed as a track, ``False`` when
        it was rejected or did not match.
    """
    if thetype is None or not line:
        return False
    # The original tested the same ASCII-colon prefix twice; the second test
    # is taken to have meant the full-width colon, so both are accepted.
    if line.startswith(('作詞:', '作詞:')):
        return False

    match = re.match(
        r'^\s*(?P<title>.+?)( - (?P<performer>.+?))?( \[(?P<length>\d+:\d+)\])?$',
        line)
    if not match:
        return False

    title = match.group('title')
    performer = match.group('performer')
    length = match.group('length')

    song = Song(track_id, title, performer, False, length)
    song.thetype = thetype
    no_title = meta_parser.normalize(title)
    # The first song registered under a key wins.
    table.setdefault(thetype, {}).setdefault(no_title, song)
    return True
def crawl_page(single):
    """Download a single's wiki page and register its CD tracks in ``table``.

    The page at ``wiki_base + single.link`` is fetched and parsed. The
    publish date is read from the infobox and stored on
    ``single.publish_date``. Starting at the 'シングル収録トラック' headline,
    the following sibling elements are scanned:

    * ``h3`` elements select the current disc type (a span text starting
      with 'Type', optionally prefixed by '通常盤 ', or an entry of
      ``meta_parser.get_other_disc_type()``);
    * ``table.tracklist`` elements whose caption starts with 'CD' are read
      row by row; each track becomes a ``Song`` stored under
      ``table[(normalized_title, is_off_vocal)]`` unless that key exists;
    * an ``h2`` containing '選抜メンバー' (or the end of the sibling list)
      stops the scan.

    Args:
        single: Object providing ``link``; ``publish_date`` is assigned.

    Returns:
        bool: ``False`` when the page has no infobox table or no track
        headline, ``True`` otherwise.
    """
    url = wiki_base + single.link
    print("crawling page: " + url)
    # Close the HTTP response deterministically.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf-8')
    pp = BeautifulSoup(page, 'html.parser')

    info_table = pp.find('table', {'class': 'infobox'})
    if not info_table:
        return False
    single.publish_date = meta_parser.parse_date(
        info_table.find('time', {
            'itemprop': 'datePublished'
        }).text)

    headline = pp.find('span', class_='mw-headline', text='シングル収録トラック')
    if headline is None:
        # No track section on this page; nothing to crawl.
        return False

    thetype = None
    node = headline.parent
    while True:
        node = node.next_sibling
        if node is None:
            # Ran out of siblings without meeting the members section.
            break
        if isinstance(node, NavigableString):
            continue

        if node.name == 'h3':
            for h3_child in node.findChildren():
                if h3_child.name != 'span':
                    continue
                type_text = str(h3_child.string)
                if type_text.startswith('通常盤 Type'):  # NMB-17th
                    type_text = type_text[4:]
                if type_text.startswith(
                        'Type'
                ) or type_text in meta_parser.get_other_disc_type():
                    thetype = type_text
                    break
        elif node.name == 'table' and node.has_attr('class') and (
                node['class'] == 'tracklist'
                or node['class'] == ['tracklist']):
            caption = node.find('caption').text
            if not caption.startswith('CD'):
                continue
            for tr in node.find_all('tr'):
                tds = tr.find_all('td')
                if len(tds) == 0:
                    continue
                track_match = re.match(r'(?P<track_id>\d+)\.', tds[0].text)
                if not track_match:
                    continue
                track_id = track_match.group('track_id')

                # The performer is the <small> text without its first and
                # last character.
                small = tds[1].find('small')
                performer = small.text[1:-1] if small else ''

                title_match = re.match(
                    r'「(?P<title>.*)」',
                    tds[1].find(text=True, recursive=False))
                if not title_match:
                    continue
                title = title_match.group('title')

                is_off_vocal = False
                lowered_title = title.lower()
                if lowered_title.find("off vocal") >= 0:
                    is_off_vocal = True
                    title = re.sub(
                        r'\s*[\-((~]?\s?[Oo]ff vocal ver(\.)?[\-))~]?\s*$',
                        '', title)
                elif lowered_title.find("instrumental") >= 0:
                    is_off_vocal = True
                    title = re.sub(
                        r'\s*[\-((~]?\s?[Ii]nstrumental[\-))~]?\s*$', '',
                        title)

                # author[2], composer[3], combiner[4]
                length_match = re.match(
                    r'(?P<length>\d+:\d+)',
                    tds[5].text) if len(tds) > 5 else None
                length = length_match.group(
                    'length') if length_match else '00:00:00'

                song = Song(track_id, title, performer, is_off_vocal, length)
                song.thetype = thetype
                pair = (meta_parser.normalize(title), is_off_vocal)
                # The first song registered under a key wins.
                if pair not in table:
                    table[pair] = song
        elif node.name == 'h2' and '選抜メンバー' in node.text:
            # The members section marks the end of the track listing.
            break
    return True