Пример #1
0
def deal_with_all_files(root_path):
    """Attach audio files below ``root_path`` to the songs stored in ``table``.

    Only directories directly below ``root_path`` are scanned; the root itself
    and deeper levels are skipped.  The directory name selects the disc type:

    * a name containing ``Type`` maps to ``'Type-'`` plus the character five
      positions after the start of ``Type`` (``Type-A`` and ``Type A`` both
      give ``Type-A``);
    * otherwise the first entry of ``meta_parser.get_other_disc_type()`` that
      occurs in the name is used.

    Directories whose type cannot be derived, or whose type is not a key of
    ``table``, are skipped.  For every audio file, each
    ``(keyname, keyoff)`` entry of ``table[type_str]`` whose ``keyname``
    occurs in the normalized file name and whose ``keyoff`` equals the file's
    off-vocal flag receives the file through ``song.set_file`` -- but only if
    the song has no ``pure_filename`` yet or the new file name is shorter.

    Args:
        root_path: directory whose first-level sub-directories are scanned.
    """
    import os

    audio_suffixes = ('.wav', '.mp3', '.flac', '.ape', '.m4a')
    # 'off cocal' is kept on purpose: it is part of the original keyword list.
    off_vocal_words = ('off vocal', 'off cocal', 'off-vocal', 'instrument',
                       'music')

    for root, _dirs, files in os.walk(root_path):
        if not files:
            continue
        # relpath is correct even when root_path ends with a separator,
        # where slicing by len(root_path) + 1 would drop a character.
        rel_path_str = os.path.relpath(root, root_path)
        # Skip the root itself and anything nested deeper than one level
        # (os.sep instead of a hard-coded backslash keeps this portable).
        if rel_path_str == os.curdir or os.sep in rel_path_str:
            continue

        type_pos = rel_path_str.find('Type')
        if type_pos >= 0:
            letter_pos = type_pos + 5
            if letter_pos >= len(rel_path_str):
                # e.g. a folder called just "Type": no type letter to read.
                continue
            type_str = 'Type-' + rel_path_str[letter_pos]
        else:
            type_str = next(
                (disc_cd for disc_cd in meta_parser.get_other_disc_type()
                 if disc_cd in rel_path_str), None)
            if type_str is None:
                continue
        if type_str not in table:
            continue

        for file in files:
            fn = meta_parser.normalize(file)
            if not fn.endswith(audio_suffixes):
                continue
            lowered = file.lower()
            is_off_vocal = any(word in lowered for word in off_vocal_words)
            for (keyname, keyoff), song in table[type_str].items():
                if keyname not in fn or keyoff != is_off_vocal:
                    continue
                # Prefer the shortest matching file name for a song.
                if not song.pure_filename or len(
                        song.pure_filename) > len(file):
                    song.set_file(rel_path_str, file)
Пример #2
0
def handle_with_line(thetype, line):
    """Parse one track-list line and register the song in ``table``.

    The line is expected to look like
    ``<track no>. ... 「<title>」(<performer>) ... <m:ss>``; footnote markers
    of the form ``[注 N]`` are removed first.  Lines that do not match are
    ignored.  A title containing "off vocal" (any letter case) marks the song
    as off-vocal and has its trailing "off vocal ver." decoration removed.

    The song is stored as ``table[thetype][(normalized title, is_off_vocal)]``
    unless that key already exists; an existing entry is never overwritten.

    Args:
        thetype: disc type the line belongs to; ``None`` makes the call a
            no-op.
        line: one already-cleaned line of the track listing.

    Returns:
        Always ``None``.
    """
    if thetype is None:
        return
    line = re.sub(r'\[注 \d+\]', '', line)
    # (?<!\d) stops the greedy ".*" from swallowing the leading digits of
    # the duration ("14:31" used to be captured as "4:31").
    match = re.match(
        r'(?P<track_id>\d+)\..*\s+「(?P<title>.*)」'
        r'(\((?P<performer>[\w\s]+)\))?.*(?<!\d)(?P<length>\d+:\d+)', line)
    if not match:
        return
    track_id = match.group('track_id')
    title = match.group('title')
    performer = match.group('performer')
    length = match.group('length')

    is_off_vocal = 'off vocal' in title.lower()
    if is_off_vocal:
        # Detection above ignores case, so the strip must ignore it as well;
        # otherwise "Off Vocal ver." stays inside the lookup key.
        title = re.sub(r'\s*[\-((]?off vocal ver(\.)?[))\-]?\s*$', '', title,
                       flags=re.IGNORECASE)

    song = Song(track_id, title, performer, is_off_vocal, length)
    song.thetype = thetype
    pair = (meta_parser.normalize(title), is_off_vocal)
    if thetype not in table:
        table[thetype] = {}
    # First occurrence wins.
    if pair not in table[thetype]:
        table[thetype][pair] = song
Пример #3
0
def load_wikipedia(filename):
    """Parse a saved Wikipedia single page (plain text) line by line.

    The parser is a small state machine driven by section headings:

    * ``meta_mode`` (initial): the first non-empty line is handed to
      ``meta_parser.parse_album``; ``リリース`` lines give the release date and
      lines ending in ``の シングル`` give the performer.
    * ``track_mode`` (after a line starting with ``シングル収録トラック``):
      ``Type...`` lines or known disc names set the current disc type,
      ``CD`` / ``DVD`` lines switch track reading on / off, and every other
      line is passed to ``handle_with_line`` while reading is on.
    * ``unit_mode`` (after a line starting with ``選抜メンバー``): a line
      whose normalized text is a key ``(text, False)`` of ``table`` selects a
      song; the following lines are used to fill in that song's performer if
      it has none.

    Args:
        filename: path of the UTF-8 text file to read.

    Returns:
        dict with the keys ``title``, ``date`` and ``performer`` for whichever
        of them were found in the meta section.
    """
    # Read everything up front so the file is closed before parsing starts.
    with open(filename, encoding='utf-8') as f:
        content = [raw_line.strip() for raw_line in f]

    # Compiled once instead of once per line; raw strings avoid the invalid
    # "\[" / "\s" escapes of a plain string literal.
    footnote_re = re.compile(r'\[[\s\w]+\]')
    unit_re = re.compile(r'.*ユニット:([^),、]+).*')

    meta = {}
    will_read = False
    thetype = None
    first_line = True
    mode = 'meta_mode'
    patch_song = None
    for line in content:
        if line == '':
            continue
        if line.startswith('シングル収録トラック'):
            mode = 'track_mode'
            will_read = False
            continue
        if line.startswith('選抜メンバー'):
            mode = 'unit_mode'
            will_read = False
            continue

        if mode == 'meta_mode':
            if first_line:
                # parse_album also decides whether the next line still counts
                # as the "first line".
                first_line, album = meta_parser.parse_album(line)
                meta['title'] = album
            if line.startswith('リリース'):
                meta['date'] = meta_parser.parse_release_date(line)
            elif line.endswith('の シングル'):
                meta['performer'] = meta_parser.parse_performer(
                    line, 'シングル')
        elif mode == 'track_mode':
            if line.startswith(
                    'Type') or line in meta_parser.get_other_disc_type():
                thetype = line
            elif line.startswith('CD'):
                will_read = True
            elif line.startswith('DVD'):
                will_read = False
            elif will_read:
                handle_with_line(thetype, meta_parser.purge_line(line))
        elif mode == 'unit_mode':
            line = footnote_re.sub('', line)
            nline = meta_parser.normalize(line)
            if (nline, False) in table:
                # A song title: the lines after it may name its performer.
                will_read = True
                patch_song = table[(nline, False)]
            elif will_read and patch_song and not patch_song.performer:
                match = unit_re.match(line)
                will_read = False
                if match:
                    patch_song.performer = match.group(1)
                elif line.find('、') < 0 and line.find(':') < 0:
                    # A bare name without list separators or labels.
                    patch_song.performer = line
                else:
                    # Not a usable performer line; keep looking.
                    will_read = True
    return meta
Пример #4
0
def deal_with_all_files(root_path):
    """Attach audio files in disc folders below ``root_path`` to ``table``.

    Two folder layouts are recognised (relative to ``root_path``):

    * ``<type>/<Disc ...>`` -- e.g. ``Type A/Disc 2``.  A first component
      starting with ``Type`` becomes ``'Type-'`` plus whatever follows its
      first five characters; a component listed in
      ``meta_parser.get_other_disc_type()`` is used as is.
    * ``<DISC ...>`` -- e.g. ``DISC 2 (Type-A)``.  The whole folder name is
      used as the type key.

    Folders of any other shape, and folders whose type key is not in
    ``table``, are skipped.  For every audio file, each song of
    ``table[type_str]`` whose key occurs in the normalized file name receives
    the file through ``song.set_file`` if it has no ``pure_filename`` yet or
    the new file name is shorter.

    Args:
        root_path: directory tree to scan.
    """
    import os

    audio_suffixes = ('.wav', '.mp3', '.flac', '.ape', '.m4a')

    for root, _dirs, files in os.walk(root_path):
        # relpath stays correct when root_path ends with a separator, and
        # os.sep replaces the hard-coded Windows backslash.
        rel_path_str = os.path.relpath(root, root_path)
        if rel_path_str == os.curdir:
            continue
        rel_path = rel_path_str.split(os.sep)
        first = rel_path[0]

        if len(rel_path) == 2 and rel_path[1].lower().startswith(
                'disc'):  # Genre 01: Type A/Disc 2
            if first.startswith('Type'):
                type_str = 'Type-' + first[len('Type-'):]
            elif first in meta_parser.get_other_disc_type():
                type_str = first
            else:
                continue
            # The disc number used to be parsed here from the wrong variable
            # and was never used afterwards, so that dead code is gone.
        elif len(rel_path) == 1 and first.lower().startswith(
                'disc'):  # Genre 02: DISC 2 (Type-A)
            type_str = first
        else:
            continue

        # Applies to both layouts: an unknown type must be skipped instead of
        # raising KeyError on the table lookup below.
        if type_str not in table:
            continue

        for file in files:
            fn = meta_parser.normalize(file)
            if not fn.endswith(audio_suffixes):
                continue
            for keyname, song in table[type_str].items():
                if keyname not in fn:
                    continue
                # Prefer the shortest matching file name for a song.
                if not song.pure_filename or len(
                        song.pure_filename) > len(file):
                    song.set_file(rel_path_str, file)
Пример #5
0
def deal_with_all_files(root_path):
    """Attach every audio file below ``root_path`` to its song in ``table``.

    All directories of the tree are scanned.  A file counts as off-vocal when
    its lower-cased name contains one of the off-vocal keywords.  For each
    audio file, every ``(keyname, keyoff)`` entry of ``table`` whose
    ``keyname`` occurs in the normalized file name and whose ``keyoff`` equals
    the off-vocal flag receives the file through ``song.set_file`` -- but only
    if the song has no ``pure_filename`` yet or the new file name is shorter.

    Args:
        root_path: directory tree to scan.  Files directly inside it are
            registered with an empty relative path.
    """
    import os

    audio_suffixes = ('.wav', '.mp3', '.flac', '.ape', '.m4a')
    # 'instrument' already matches 'instrumental', so one entry is enough.
    off_vocal_words = ('off vocal', 'off-vocal', 'instrument', 'music')

    for root, _dirs, files in os.walk(root_path):
        if not files:
            continue
        # Slicing by len(root_path) + 1 drops the first character of the
        # relative path when root_path ends with a separator.
        rel_path_str = os.path.relpath(root, root_path)
        if rel_path_str == os.curdir:
            rel_path_str = ''
        for file in files:
            fn = meta_parser.normalize(file)
            if not fn.endswith(audio_suffixes):
                continue
            lowered = file.lower()
            is_off_vocal = any(word in lowered for word in off_vocal_words)
            for (keyname, keyoff), song in table.items():
                if keyname not in fn or keyoff != is_off_vocal:
                    continue
                # Prefer the shortest matching file name for a song.
                if not song.pure_filename or len(
                        song.pure_filename) > len(file):
                    song.set_file(rel_path_str, file)
Пример #6
0
def handle_with_line_others(thetype, track_id, line):
    """Parse a ``title - performer [m:ss]`` line and register the song.

    Performer and length are optional.  Lyricist credit lines (starting with
    ``作詞`` followed by a colon) are not tracks and are rejected.  The song
    is stored as ``table[thetype][normalized title]`` unless that key already
    exists; an existing entry is never overwritten.

    Args:
        thetype: disc type the line belongs to; ``None`` rejects the line.
        track_id: track number to give the new ``Song``.
        line: one line of the track listing.

    Returns:
        ``True`` if the line was accepted as a track (even when an equally
        named song was already registered), ``False`` otherwise.
    """
    if thetype is None or not line:
        return False
    # The original tested the same ASCII-colon prefix twice; the second test
    # was evidently meant for the full-width colon, so both are checked.
    if line.startswith(('作詞:', '作詞:')):
        return False
    match = re.match(
        r'^\s*(?P<title>.+?)( - (?P<performer>.+?))?( \[(?P<length>\d+:\d+)\])?$',
        line)
    if not match:
        return False
    title = match.group('title')
    performer = match.group('performer')
    length = match.group('length')
    song = Song(track_id, title, performer, False, length)
    song.thetype = thetype
    no_title = meta_parser.normalize(title)
    if thetype not in table:
        table[thetype] = {}
    # First occurrence wins.
    if no_title not in table[thetype]:
        table[thetype][no_title] = song
    return True
Пример #7
0
def crawl_page(single):
    """Download the wiki page of ``single`` and register its CD tracks.

    The page at ``wiki_base + single.link`` is fetched and parsed.  The
    publish date is read from the infobox, then the siblings following the
    ``シングル収録トラック`` headline are walked: ``h3`` headings set the
    current disc type, and every ``tracklist`` table whose caption starts
    with ``CD`` contributes one ``Song`` per row.  Songs are stored as
    ``table[(normalized title, is_off_vocal)]``; an existing key is never
    overwritten.  The walk ends at the ``選抜メンバー`` ``h2`` heading or at
    the last sibling, whichever comes first.

    Args:
        single: object providing ``link``; its ``publish_date`` is set.

    Returns:
        ``False`` if the page has no infobox or no track-list headline,
        ``True`` otherwise.

    Raises:
        AttributeError: if the infobox has no ``datePublished`` time tag
            (unchanged from the original behaviour).
    """
    url = wiki_base + single.link
    print("crawling page: " + url)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf-8')
    pp = BeautifulSoup(page, 'html.parser')

    info_table = pp.find('table', {'class': 'infobox'})
    if not info_table:
        return False
    single.publish_date = meta_parser.parse_date(
        info_table.find('time', {
            'itemprop': 'datePublished'
        }).text)

    headline = pp.find('span', class_='mw-headline', text='シングル収録トラック')
    if headline is None:
        # No track section: nothing to register.
        return False

    # Compiled once instead of once per table row.
    track_id_re = re.compile(r'(?P<track_id>\d+)\.')
    title_re = re.compile(r'「(?P<title>.*)」')
    length_re = re.compile(r'(?P<length>\d+:\d+)')
    off_vocal_re = re.compile(
        r'\s*[\-((~]?\s?[Oo]ff vocal ver(\.)?[\-))~]?\s*$')
    instrumental_re = re.compile(r'\s*[\-((~]?\s?[Ii]nstrumental[\-))~]?\s*$')

    thetype = None
    node = headline.parent
    while True:
        node = node.next_sibling
        if node is None:
            # Ran out of siblings without meeting the member section.
            break
        if isinstance(node, NavigableString):
            continue
        if node.name == 'h3':
            for child in node.findChildren():
                if child.name != 'span':
                    continue
                type_text = str(child.string)
                if type_text.startswith('通常盤 Type'):  # NMB-17th
                    type_text = type_text[4:]
                if type_text.startswith(
                        'Type'
                ) or type_text in meta_parser.get_other_disc_type():
                    thetype = type_text
                    break
        elif node.name == 'table' and node.has_attr('class') and (
                node['class'] == 'tracklist'
                or node['class'] == ['tracklist']):
            caption = node.find('caption')
            if caption is None or not caption.text.startswith('CD'):
                continue
            for tr in node.find_all('tr'):
                tds = tr.find_all('td')
                # A track row needs at least the number and the title cell.
                if len(tds) < 2:
                    continue
                track_match = track_id_re.match(tds[0].text)
                if not track_match:
                    continue
                track_id = track_match.group('track_id')
                small = tds[1].find('small')
                # The performer is wrapped in one character on each side.
                performer = small.text[1:-1] if small else ''
                title_text = tds[1].find(text=True, recursive=False)
                if title_text is None:
                    continue
                title_match = title_re.match(title_text)
                if not title_match:
                    continue
                title = title_match.group('title')
                is_off_vocal = False
                lowered = title.lower()
                if 'off vocal' in lowered:
                    is_off_vocal = True
                    title = off_vocal_re.sub('', title)
                elif 'instrumental' in lowered:
                    is_off_vocal = True
                    title = instrumental_re.sub('', title)
                # author[2],composer[3],combiner[4]
                length_match = length_re.match(
                    tds[5].text) if len(tds) > 5 else None
                length = length_match.group(
                    'length') if length_match else '00:00:00'
                song = Song(track_id, title, performer, is_off_vocal, length)
                song.thetype = thetype
                pair = (meta_parser.normalize(title), is_off_vocal)
                # First occurrence wins.
                if pair not in table:
                    table[pair] = song
        elif node.name == 'h2' and '選抜メンバー' in node.text:
            break
    return True