示例#1
0
def refresh_the_jazz_page_cache():
    """
    Reads entries from The Jazz Page into a local cache for searching.

    Downloads the site's midi index page, pulls the link and the name out
    of every entry row and writes them as C{[url, lowercased name]} rows to
    the cache file in C{LOCAL_DATA_DIR}.

    The index page is fetched and parsed completely before the cache file
    is touched, so a failed download or parse leaves an existing cache as
    it was instead of replacing it with an empty or partial file.

    @see: L{the_jazz_page_midi_files}

    """
    import os
    from contextlib import closing
    from urllib2 import urlopen
    from BeautifulSoup import BeautifulSoup
    from jazzparser.settings import LOCAL_DATA_DIR
    from jazzparser.utils.csv import UnicodeCsvWriter

    # NOTE(review): the file is called "the_midi_site_cache" although this
    # function caches The Jazz Page - left unchanged because the reader of
    # the cache is not visible here; confirm it does not clash with the
    # cache of another source.
    cache_filename = os.path.join(LOCAL_DATA_DIR, "the_midi_site_cache")
    domain = "http://www.thejazzpage.de"
    index_url = "%s/midiinfo.html" % domain

    # Read in the index page to get the list of entries from. The response
    # object of urllib2 is not a context manager itself, hence closing().
    with closing(urlopen(index_url)) as response:
        soup = BeautifulSoup(response.read())

    # After the first table, each one is a letter, apart from the last one
    tables = soup.findAll("table")[1:-1]

    entries = []
    # Number of following rows whose first column is still covered by a
    # rowspan cell of an earlier row (such rows lack their own first cell)
    rowspan = 0
    for table in tables:
        for row in table.findAll("tr"):
            # NOTE(review): presumably the children of a row are only its
            # cell tags (no whitespace text nodes) - confirm against the page
            cells = list(row)
            if rowspan == 0:
                if cells[0].has_key("rowspan"):
                    rowspan = int(cells[0]["rowspan"]) - 1
                # This is a row with a first column in it
                # Ignore the first column - we only want the 2nd
                middle_cell = cells[1]
            else:
                middle_cell = cells[0]
                rowspan -= 1
            # Get the file and song name from the middle cell
            if middle_cell.a is not None:
                link = "%s/%s" % (domain, middle_cell.a["href"])
                name = middle_cell.a.text.replace("\n", "").lower()
                entries.append([link, name])

    # Only now replace the old cache: everything to be written is in memory
    if os.path.exists(cache_filename):
        os.remove(cache_filename)
    with open(cache_filename, 'w') as cache_file:
        writer = UnicodeCsvWriter(cache_file)
        for entry in entries:
            writer.writerow(entry)
def refresh_the_jazz_page_cache():
    """
    Reads entries from The Jazz Page into a local cache for searching.

    Downloads the site's midi index page, pulls the link and the name out
    of every entry row and writes them as C{[url, lowercased name]} rows to
    the cache file in C{LOCAL_DATA_DIR}.

    The index page is fetched and parsed completely before the cache file
    is touched, so a failed download or parse leaves an existing cache as
    it was instead of replacing it with an empty or partial file.

    @see: L{the_jazz_page_midi_files}

    """
    import os
    from contextlib import closing
    from urllib2 import urlopen
    from BeautifulSoup import BeautifulSoup
    from jazzparser.settings import LOCAL_DATA_DIR
    from jazzparser.utils.csv import UnicodeCsvWriter

    # NOTE(review): the file is called "the_midi_site_cache" although this
    # function caches The Jazz Page - left unchanged because the reader of
    # the cache is not visible here; confirm it does not clash with the
    # cache of another source.
    cache_filename = os.path.join(LOCAL_DATA_DIR, "the_midi_site_cache")
    domain = "http://www.thejazzpage.de"
    index_url = "%s/midiinfo.html" % domain

    # Read in the index page to get the list of entries from. The response
    # object of urllib2 is not a context manager itself, hence closing().
    with closing(urlopen(index_url)) as response:
        soup = BeautifulSoup(response.read())

    # After the first table, each one is a letter, apart from the last one
    tables = soup.findAll("table")[1:-1]

    entries = []
    # Number of following rows whose first column is still covered by a
    # rowspan cell of an earlier row (such rows lack their own first cell)
    rowspan = 0
    for table in tables:
        for row in table.findAll("tr"):
            # NOTE(review): presumably the children of a row are only its
            # cell tags (no whitespace text nodes) - confirm against the page
            cells = list(row)
            if rowspan == 0:
                if cells[0].has_key("rowspan"):
                    rowspan = int(cells[0]["rowspan"]) - 1
                # This is a row with a first column in it
                # Ignore the first column - we only want the 2nd
                middle_cell = cells[1]
            else:
                middle_cell = cells[0]
                rowspan -= 1
            # Get the file and song name from the middle cell
            if middle_cell.a is not None:
                link = "%s/%s" % (domain, middle_cell.a["href"])
                name = middle_cell.a.text.replace("\n", "").lower()
                entries.append([link, name])

    # Only now replace the old cache: everything to be written is in memory
    if os.path.exists(cache_filename):
        os.remove(cache_filename)
    with open(cache_filename, "w") as cache_file:
        writer = UnicodeCsvWriter(cache_file)
        for entry in entries:
            writer.writerow(entry)
示例#3
0
def main():
    """
    Command-line entry point: looks up midi files for songs and saves them.

    The songs come either from a sequence index file (all sequences, a
    single one selected with C{--index}, or those from C{--resume}
    onwards) or, with C{--name}, from a song name given directly as the
    arguments. Every file that C{find_midi_files} returns for a name is
    written to the output directory as C{<name>-<n>.mid}, and a row with
    the filename, the reported song name and the sequence id is written to
    the C{NAMES} CSV file in the same directory (appended to when
    resuming, otherwise started afresh with a header row).

    Exits with status 1 if the output directory does not exist or if no
    input file / song name was given.

    """
    usage = "%prog [options] <in-file>"
    description = "Reads in a sequence index file and tries to find "\
        "midi files of each song by looking up the name online. Writes "\
        "them all to the given directory."
    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "-i", "--index", dest="index", action="store", type="int",
        help="select a single sequence by index from the file and just "
             "get files for that sequence")
    parser.add_option(
        "-n", "--name", dest="name", action="store_true",
        help="interpret the arguments as a song name to look up directly "
             "instead of fetching the name of a sequence from a file")
    parser.add_option(
        "-v", "--verbose", dest="verbose", action="store_true",
        help="verbose output")
    parser.add_option(
        "-s", "--source", dest="sources", action="append",
        help=("sources to get midi files from (use option multiple times "
              "for multiple sources). Possible values: %s. Default: all "
              "sources.") % ", ".join(SOURCES))
    parser.add_option(
        "-r", "--resume", dest="resume", action="store", type="int",
        help="resume lookup at the given sequence index. Sequences before "
             "this index will be skipped at the names entries will be "
             "appended to an existing file.")
    parser.add_option(
        "-d", "--dir", dest="dir", action="store",
        help="directory to output files to. By default, outputs to the "
             "current directory")
    options, arguments = parser.parse_args()

    if options.dir is not None:
        outdir = os.path.abspath(options.dir)
    else:
        outdir = os.path.abspath(os.getcwd())

    if not os.path.isdir(outdir):
        # Stop here: carrying on would only fail later when opening NAMES
        sys.stderr.write("%s is not a directory\n" % outdir)
        sys.exit(1)

    if options.name:
        # The arguments together make up the song name; there is no
        # database id for a name given by hand
        if len(arguments) == 0:
            sys.stderr.write("You must specify a song name to look up\n")
            sys.exit(1)
        sequences = [(" ".join(arguments), None)]
    else:
        if len(arguments) == 0:
            sys.stderr.write(
                "You must specify an input sequence index file\n")
            sys.exit(1)
        filename = os.path.abspath(arguments[0])

        # Read in the data file
        seqs = SequenceIndex.from_file(filename)
        if options.index is not None:
            seq = seqs.sequence_by_index(options.index)
            sequences = [(seq.name, seq.id)]
        elif options.resume is not None:
            sequences = [(s.name, s.id)
                         for s in seqs.sequences[options.resume:]]
        else:
            sequences = [(s.name, s.id) for s in seqs.sequences]

    if options.verbose:
        verbose_out = sys.stderr
        out_prefix = ">>> "
    else:
        verbose_out = None
        out_prefix = ""

    # Output a name list: start a new file, or append to the old one if
    # we're resuming an earlier run
    resuming = options.resume is not None
    names_mode = 'a' if resuming else 'w'
    # Identity table: translate() is only used to delete punctuation
    no_translation = string.maketrans("", "")

    with open(os.path.join(outdir, "NAMES"), names_mode) as namefile:
        names = UnicodeCsvWriter(namefile)
        if not resuming:
            # Add a header if we're not appending to an old file
            names.writerow(['Filename', 'Reported song name', 'Database id'])

        for seq_name, seq_id in sequences:
            # Single-argument print(...) behaves the same as a print statement
            print("%sLooking up %s" % (out_prefix, seq_name))
            files = find_midi_files(seq_name, sources=options.sources,
                                    verbose_out=verbose_out)
            print("%s  Found %d files" % (out_prefix, len(files)))
            # Create a suitable base filename: ASCII only, no punctuation,
            # lowercase, words joined by underscores
            ascii_name = seq_name.encode('ascii', 'ignore')
            cleaned_name = ascii_name.translate(no_translation,
                                                string.punctuation)
            base_filename = "_".join(cleaned_name.lower().split())
            for i, (data, name) in enumerate(files):
                filename = u"%s-%d.mid" % (base_filename, i)
                full_filename = os.path.join(outdir, filename)
                # Write each midi file out individually. Midi data is
                # binary, so it must not go through newline translation.
                with open(full_filename, 'wb') as midi_file:
                    midi_file.write(data)
                # Keep a list of the name reported for each file
                names.writerow([filename, name, seq_id])
            # Make the names written so far safe before the next lookup
            namefile.flush()