예제 #1
0
파일: oscar.py 프로젝트: tgadf/movies
    def createRawOscarData(self, debug=True):
        """Combine parsed yearly Oscar results with saved corrections.

        Every ``.json`` file in the wiki results directory is loaded and keyed
        by its base filename.  If ``saved.yaml`` exists in the corrections
        directory, each of its per-year, per-title ``Winner`` / ``Nominees``
        entries replaces the parsed value.  The merged data is written to
        ``raw.yaml`` in the corrections directory.

        Args:
            debug: When true, print how many result files were found.
        """
        print("Checking for poorly parsed oscar data.")
        resultFiles = sorted(findExt(self.wikiData.getResultsDir(), ext=".json"))
        if debug:
            print("Found {0} oscar files".format(len(resultFiles)))
        yearlyData = {
            getBaseFilename(resultFile): getFile(resultFile)
            for resultFile in resultFiles
        }

        savedPath = setFile(self.getCorrectionsDir(), "saved.yaml")
        savedData = getFile(savedPath) if isFile(savedPath) else {}

        # Saved corrections take precedence over whatever was parsed.
        for year, savedTitles in savedData.items():
            for title, savedEntry in savedTitles.items():
                savedWinner = savedEntry.get("Winner")
                savedNominees = savedEntry.get("Nominees")
                if savedWinner is not None:
                    print("Overwritting {0} {1} winner".format(year, title))
                    yearlyData[year][title]["Winner"] = savedWinner
                if savedNominees is not None:
                    print("Overwritting {0} {1} nominees".format(year, title))
                    yearlyData[year][title]["Nominees"] = savedNominees

        rawPath = setFile(self.getCorrectionsDir(), "raw.yaml")
        saveFile(idata=yearlyData, ifile=rawPath)
예제 #2
0
    def findSearchTerms(self, minCnts=25):
        """Return artist names seen often enough to warrant a new search.

        Loads every ``.p`` file found under the directories returned by
        ``self.getModValDirs()`` (skipping ``datPiffKnown.p``), passes each
        record's ``ArtistName`` through ``self.mulArts.getArtistNames`` and
        counts the title-cased names that are longer than one character and
        not present in the data loaded from ``self.knownFile``.

        Args:
            minCnts: Minimum number of occurrences for a name to be returned.

        Returns:
            list: Names counted at least ``minCnts`` times, most common first.
        """
        from collections import Counter

        artistsCntr = Counter()
        known = getFile(self.knownFile)

        files = getFlatList([findExt(dirval, ext='.p') for dirval in self.getModValDirs()])
        for ifile in files:
            if ifile.endswith("datPiffKnown.p"):
                continue
            for record in getFile(ifile):
                # getArtistNames returns a mapping; only its keys (names) are used.
                names = self.mulArts.getArtistNames(record["ArtistName"])
                for name in names.keys():
                    key = name.title()
                    if len(key) > 1 and key not in known:
                        artistsCntr[key] += 1

        searchTerms = [item[0] for item in artistsCntr.most_common() if item[1] >= minCnts]
        print("There are {0} new searches".format(len(searchTerms)))
        return searchTerms
예제 #3
0
    def saveCorrections(self, debug=True):
        """Merge the corrections in ``corr.yaml`` into ``saved.yaml``.

        Movies that are new to the saved data are added as-is.  For movies
        already present, the saved and new correction lists are concatenated
        and de-duplicated.  Nothing is written when ``corr.yaml`` loads as
        ``None``.

        Args:
            debug: When true, print each newly added movie and forward the
                flag to ``saveFile``.

        Raises:
            ValueError: If ``saved.yaml`` cannot be read or written.  The
                original exception is attached as ``__cause__``.
        """
        corrsavename = setFile(self.getDataDir(), "corr.yaml")
        corrData = getFile(corrsavename)

        try:
            savename = setFile(self.getDataDir(), "saved.yaml")
            savedData = getFile(savename)
        except Exception as err:
            raise ValueError("Could not access saved data!") from err

        # getFile can return None (see the corrData check below); treat that
        # as "no corrections saved yet" rather than failing on len(None).
        if savedData is None:
            savedData = {}

        if corrData is None:
            print("There is no corrections data.")
            return

        print("Found {0} old corrections".format(len(savedData)))
        print("Found {0} new corrections".format(len(corrData)))
        for movie, corrs in corrData.items():
            existing = savedData.get(movie)
            if existing is None:
                if debug:
                    print("Adding {0}".format(movie))
                savedData[movie] = corrs
                continue

            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the saved file does not get reshuffled on every run.
            merged = list(dict.fromkeys(existing + corrs))
            if len(merged) != len(existing):
                print("Adding new corrections to {0}".format(movie))
            savedData[movie] = merged

        try:
            saveFile(idata=savedData, ifile=savename, debug=debug)
        except Exception as err:
            raise ValueError("There was an error saving the saved corrctions yaml file!") from err
        print("There are {0} total corrections".format(len(savedData)))
예제 #4
0
 def parse(self, expr, force=False, debug=False, quiet=False):
     """Re-save raw artist files under their artist-ID based save names.

     The files to process come from ``self.getArtistRawFiles``.  Each one is
     loaded with ``getFile``, ``self.artist.getData`` supplies the artist ID,
     and the loaded contents are written to the save name returned by
     ``self.dutils.getArtistSavename``.  Files with no artist ID or no save
     name are skipped.

     Args:
         expr: Forwarded to ``getArtistRawFiles``.
         force: Forwarded to ``getArtistRawFiles``.
         debug: Accepted for interface compatibility; not used here.
         quiet: Accepted for interface compatibility; not used here.
     """
     ts = timestat("Parsing Raw Files")

     tsFiles  = timestat("Finding Files To Parse")
     newFiles = self.getArtistRawFiles(datatype=self.datatype, expr=expr, force=force)
     tsFiles.stop()

     N = len(newFiles)
     tsParse = timestat("Parsing {0} New Raw Files".format(N))

     newData = 0
     # Report progress less often for large batches.
     modValue = 250 if N >= 500 else 50
     for i,ifile in enumerate(newFiles):
         if (i+1) % modValue == 0 or (i+1) == N:
             tsParse.update(n=i+1, N=N)
         htmldata = getFile(ifile)
         retval   = self.artist.getData(ifile)
         artistID = retval.ID.ID
         if artistID is None:
             continue
         savename = self.dutils.getArtistSavename(artistID)
         if savename is None:
             continue
         saveFile(idata=htmldata, ifile=savename, debug=False)
         newData += 1

     print("Created {0}/{1} New Artist Files".format(newData, N))
     tsParse.stop()
     # The overall timer was started at the top but never stopped.
     ts.stop()
예제 #5
0
    def parse(self, expr, force=False, debug=False, quiet=False):
        """Re-save raw artist HTML files under their artist-ID based names.

        The files to process come from ``self.getArtistRawHTMLFiles``.  Each
        one is loaded with ``getFile``, ``self.artist.getData`` supplies the
        artist ID, and the loaded contents are written to the save name
        returned by ``self.dutils.getArtistSavename``.

        Args:
            expr: Forwarded to ``getArtistRawHTMLFiles``.
            force: Forwarded to ``getArtistRawHTMLFiles``.
            debug: When true, print per-file details and update the progress
                timer on every file.
            quiet: Accepted for interface compatibility; not used here.
        """
        ts = timestat("Parsing Raw HTML Files")

        tsFiles = timestat("Finding Files To Parse")
        newFiles = self.getArtistRawHTMLFiles(expr, force)
        tsFiles.stop()

        N = len(newFiles)
        if debug:
            print("Parsing {0} Raw HTML Files From Expr[{1}]".format(N, expr))

        # Report progress less often for large batches.
        modValue = 250 if N >= 500 else 50
        tsParse = timestat("Parsing {0} Raw HTML Files".format(N))
        for idx, rawFile in enumerate(newFiles):
            done = idx + 1
            if done % modValue == 0 or done == N or debug:
                tsParse.update(n=done, N=N)

            if debug:
                print("{0}/{1}\tParsing {2}".format(idx, N, rawFile))
            htmldata = getFile(rawFile)
            artistID = self.artist.getData(rawFile).ID.ID
            if debug:
                print("  ---> ID={0}".format(artistID))
            savename = self.dutils.getArtistSavename(artistID)
            saveFile(idata=htmldata, ifile=savename, debug=False)

        tsParse.stop()
        ts.stop()
예제 #6
0
    def parseBAFTACategoryData(self, ifile, category, debug=False):
        """Parse every ``wikitable`` in a saved BAFTA category page.

        Tables are parsed with ``parseBAFTADirectorData`` when ``category``
        is ``"Best_Direction"`` and with ``parseBAFTAFilmData`` otherwise.
        Results are merged by year, with later tables replacing earlier
        entries for the same year, and each per-category list is then
        de-duplicated.

        Args:
            ifile: File loaded with ``getFile`` and parsed with ``getHTML``.
            category: Category name forwarded to the table parsers.
            debug: When true, print the number of tables found.

        Returns:
            dict: Year -> {category -> list of unique entries}.
        """
        soup = getHTML(getFile(ifile))
        wikitables = soup.findAll("table", {"class": "wikitable"})
        if debug:
            print("  Found {0} tables".format(len(wikitables)))

        data = {}
        for table in wikitables:
            parseTable = (self.parseBAFTADirectorData
                          if category == "Best_Direction"
                          else self.parseBAFTAFilmData)
            data.update(parseTable(table, category, debug=False))

        # Drop duplicate entries; list order is not preserved by set().
        for yearData in data.values():
            for key in yearData.keys():
                yearData[key] = list(set(yearData[key]))

        return data
예제 #7
0
    def parseAndDownloadTeamYearlyStandings(self):
        """Download per-team data for every team linked from each season page.

        For each ``.p`` file in the season directory, the links carrying a
        ``data-clubhouse-uid`` attribute are collected.  The team ID is the
        second-to-last path component of the link's ``href`` and the team name
        is the last.  ``downloadTeamDataByYear`` is then called once per team,
        with the season taken from the file's base name.

        Raises:
            ValueError: If one team ID maps to two different names within the
                same season file.
        """
        for seasonFile in findExt(self.getSeasonDir(), ext=".p", debug=False):
            year = getBaseFilename(seasonFile)
            soup = getHTML(getFile(seasonFile))

            teamNames = {}
            for anchor in soup.findAll("a"):
                attrs = anchor.attrs
                if attrs.get("data-clubhouse-uid") is None:
                    continue

                href = attrs['href']
                name = getBasename(href)
                teamID = getBasename(getDirname(href))

                seenName = teamNames.get(teamID)
                if seenName is not None and seenName != name:
                    raise ValueError("Error in ID for this year!")
                teamNames[teamID] = name

            for teamID, name in teamNames.items():
                self.downloadTeamDataByYear(teamID,
                                            name,
                                            season=str(year),
                                            debug=True)
예제 #8
0
    def rmIDFromDB(self, artistID, modValue=None):
        """Remove one or more artist IDs from a mod-value DB file.

        The DB file ``<modValue>-DB.p`` in the artists DB directory is loaded,
        each ID is deleted from it if present, and ``self.rmIDFiles`` is
        called for every ID whether or not it was in the DB.  The DB is
        written back only if at least one entry was deleted.

        Args:
            artistID: A single ID string or a list of IDs.
            modValue: Which DB file to edit.  When ``None`` it is derived from
                ``artistID`` via ``self.dutils.getDiscIDHashMod``.
                NOTE(review): in that case a list is passed to
                ``getDiscIDHashMod`` unchanged and a single DB file is used
                for all IDs; confirm this is intended.

        Raises:
            ValueError: If ``artistID`` is neither a ``str`` nor a ``list``.
        """
        print("Trying to remove data from ArtistID {0}".format(artistID))
        if modValue is None:
            modValue  = self.dutils.getDiscIDHashMod(discID=artistID, modval=self.disc.getMaxModVal())
        artistDBDir = self.disc.getArtistsDBDir()
        dbname  = setFile(artistDBDir, "{0}-DB.p".format(modValue))
        print("Loading {0}".format(dbname))
        dbdata  = getFile(dbname)

        if isinstance(artistID, str):
            artistID = [artistID]
        elif not isinstance(artistID, list):
            raise ValueError("Not sure what to do with {0}".format(artistID))

        saveVal = False
        for ID in artistID:
            # Only a missing key is an expected outcome; anything else (for
            # example the DB failing to load) should surface, not be hidden.
            try:
                del dbdata[ID]
            except KeyError:
                print("Not there...")
            else:
                print("Deleted {0}".format(ID))
                saveVal = True

            self.rmIDFiles(ID)

        if saveVal:
            print("Saving {0}".format(dbname))
            saveFile(idata=dbdata, ifile=dbname)
        else:
            print("No reason to save {0}".format(dbname))
예제 #9
0
파일: filmsite.py 프로젝트: tgadf/movies
    def parseFilmsiteYearlyData(self, ifile, debug=False):
        """Extract movie titles from a saved Filmsite yearly page.

        The first table on the page and the first row of every remaining
        table are skipped.  In each two-cell row, the text of the first
        ``<b>`` element in the second cell is taken as the title.  Line
        breaks are removed, runs of spaces are collapsed, and everything from
        the last ``"("`` onwards is cut off.

        Args:
            ifile: File loaded with ``getFile`` and parsed with ``getHTML``.
            debug: Unused.

        Returns:
            list: Movie titles in page order.
        """
        soup = getHTML(getFile(ifile))

        movies = []
        for table in soup.findAll("table")[1:]:
            for row in table.findAll("tr")[1:]:
                cells = row.findAll("td")
                if len(cells) != 2:
                    continue
                bold = cells[1].find("b")
                if bold is None:
                    continue

                # Remove line feeds and carriage returns.
                title = "".join(ch for ch in bold.text if ch not in ("\n", "\r"))
                while "  " in title:
                    title = title.replace("  ", " ")
                parenPos = title.rfind("(")
                if parenPos != -1:
                    title = title[:parenPos].strip()
                movies.append(title)

        return movies
예제 #10
0
파일: razzies.py 프로젝트: tgadf/movies
    def parseRazziesCategoryData(self, ifile, category, debug=False):
        """Parse every ``wikitable`` in a saved Razzies category page.

        Tables without a ``<caption>`` are parsed with
        ``parseRazziesActingData`` and tables with one are parsed with
        ``parseRazziesFilmData``.  Results are merged by year, with later
        tables replacing earlier entries for the same year, and each
        per-category list is then de-duplicated.

        Args:
            ifile: File loaded with ``getFile`` and parsed with ``getHTML``.
            category: Category name forwarded to the table parsers.
            debug: When true, print the number of tables found.

        Returns:
            dict: Year -> {category -> list of unique entries}.
        """
        soup = getHTML(getFile(ifile))
        wikitables = soup.findAll("table", {"class": "wikitable"})
        if debug:
            print("  Found {0} tables".format(len(wikitables)))

        data = {}
        for table in wikitables:
            hasCaption = table.find("caption") is not None
            parseTable = (self.parseRazziesFilmData if hasCaption
                          else self.parseRazziesActingData)
            data.update(parseTable(table, category, debug=False))

        # Drop duplicate entries; list order is not preserved by set().
        for yearData in data.values():
            for key in yearData.keys():
                yearData[key] = list(set(yearData[key]))

        return data
예제 #11
0
    def processFlopsData(self, debug=False):
        """Collect movie titles per year from saved flops pages and save them.

        Every ``.html`` file in the data directory is parsed.  For each
        ``wikitable``, the header cells of the first row are printed and the
        rows from the third onwards are read.  The first ``<th>`` of a row is
        the movie title and the first ``<td>`` is the year.  The result,
        written to ``<self.name>.json`` in the results directory, maps each
        year in ascending order to a list of ``[movie, 10]`` pairs.  The
        meaning of the constant 10 is not evident from this method.

        Args:
            debug: Unused.

        Raises:
            ValueError: If a table has no header row, or a data row has no
                title cell or no integer year.  The original exception is
                attached as ``__cause__``.
        """
        from collections import OrderedDict

        yearlyData = {}
        for ifile in findExt(self.getDataDir(), ext=".html"):
            bsdata = getHTML(getFile(ifile))

            for table in bsdata.findAll("table", {"class": "wikitable"}):
                trs = table.findAll("tr")

                try:
                    headers = [x.text.replace("\n", "") for x in trs[0].findAll("th")]
                except Exception as err:
                    raise ValueError("Could not get headers") from err

                print(headers)

                # The first two rows are skipped; data starts at the third.
                for tr in trs[2:]:
                    ths = tr.findAll("th")
                    try:
                        movie = ths[0].text
                        movie = movie.replace("\n", "").strip()
                        movie = movie.replace("[nb 2]", "")
                    except Exception as err:
                        raise ValueError(
                            "Could not find movie in {0}".format(ths)) from err

                    tds = tr.findAll("td")
                    try:
                        year = int(tds[0].text)
                    except Exception as err:
                        raise ValueError(
                            "Could not find year in {0}".format(tds)) from err

                    print(year, '\t', movie)
                    yearlyData.setdefault(year, []).append(movie)

        movies = OrderedDict()
        for year in sorted(yearlyData):
            movies[year] = [[movie, 10] for movie in yearlyData[year]]

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of flops Data to {1}".format(
            len(movies), savename))
        saveFile(savename, movies)
예제 #12
0
    def parseArtistFiles(self, force=False, debug=False):
        """Build per-mod-value artist DB files from downloaded search results.

        Each search-result file holds records with ``ArtistName``,
        ``AlbumName`` and ``AlbumURL``.  Every artist name produced by
        ``self.mulArts.getArtistNames`` (title-cased) gets an entry whose
        ``Media`` list collects the albums it appears on, grouped by the
        artist's mod value.  Each group is then run through ``self.artist``,
        saved as ``<modVal>-DB.p`` and followed by the two metadata builders.

        Args:
            force: Unused.
            debug: When true, print progress per file and per record, and
                forward the flag to the metadata builders.
        """
        from glob import glob

        artistDir = self.disc.getArtistsDir()  # result unused

        artistDBData = {}

        # NOTE(review): the findExt result is immediately replaced by a
        # hard-coded, machine-specific glob; confirm which source is intended.
        files = findExt(self.knownDir, ext='.p')
        files = glob("/Volumes/Biggy/Discog/artists-datpiff/*/*.p")
        nFiles = len(files)
        print("Found {0} downloaded search terms".format(nFiles))
        for i, ifile in enumerate(files):
            if ifile.endswith("datPiffKnown.p"):
                continue
            fileresults = getFile(ifile)
            if debug:
                print(i,'/',nFiles,'\t',ifile)
            for j, fileresult in enumerate(fileresults):
                if debug:
                    print("  ",j,'/',len(fileresults))
                mixArtists = fileresult["ArtistName"]
                albumName = fileresult["AlbumName"]
                albumURL = fileresult["AlbumURL"]

                splitNames = self.mulArts.getArtistNames(mixArtists)
                mixArtistNames = [name.title() for name in splitNames.keys()]

                for artistName in mixArtistNames:
                    artistID = str(self.dutils.getArtistID(artistName))
                    # The album code is produced by the same ID helper,
                    # applied to the album name.
                    albumID = str(self.dutils.getArtistID(albumName))
                    modval = self.dutils.getArtistModVal(artistID)

                    modvalArtists = artistDBData.setdefault(modval, {})
                    if modvalArtists.get(artistName) is None:
                        modvalArtists[artistName] = {"Name": artistName, "ID": artistID, "URL": None, "Profile": None, "Media": []}
                    modvalArtists[artistName]["Media"].append(
                        {"Artists": mixArtistNames, "Name": albumName, "URL": albumURL, "Code": albumID})

        maxModVal = self.disc.getMaxModVal()  # result unused
        artistDBDir = self.disc.getArtistsDBDir()
        totalSaves = 0
        for modVal, modvaldata in artistDBData.items():
            dbData = {}
            for artistData in modvaldata.values():
                self.artist.setData(artistData)
                artistVal = self.artist.parse()
                dbData[artistVal.ID.ID] = artistVal

            savename = setFile(artistDBDir, "{0}-DB.p".format(modVal))
            print("Saving {0} artist IDs to {1}".format(len(dbData), savename))
            totalSaves += len(dbData)
            saveFile(idata=dbData, ifile=savename)

            self.createArtistModValMetadata(modVal=modVal, db=dbData, debug=debug)
            self.createArtistAlbumModValMetadata(modVal=modVal, db=dbData, debug=debug)

        print("Saved {0} new artist IDs".format(totalSaves))
0
    def __init__(self, path, chart, debug=False):
        """Load the full-chart and artist-album data for one or all charts.

        Args:
            path: Directory containing the ``current<chart>...Data.p`` files.
            chart: Chart name; its lower-cased form is used in the file names.
                When ``None``, the data of all built-in charts is merged, with
                later charts overriding earlier ones on key collisions.
            debug: Stored on the instance as ``self.debug``.
        """
        self.debug = debug
        self.chart = chart
        self.path = path

        fullChartFile = "current{0}FullChartArtistAlbumData.p"
        artistAlbumFile = "current{0}ArtistAlbumData.p"

        def loadChartFile(template, chartName):
            # Resolve the chart-specific file name under ``path`` and load it.
            return getFile(setFile(path, template.format(chartName.lower())))

        if chart is not None:
            self.fullChartData = loadChartFile(fullChartFile, chart)
            print("There are {0} artists in the full chart data".format(
                len(self.fullChartData)))
            self.artistAlbumData = loadChartFile(artistAlbumFile, chart)
            print("There are {0} artists in the artist album data".format(
                len(self.artistAlbumData)))
        else:
            mergedFullChart = {}
            mergedArtistAlbum = {}
            allCharts = (
                "MusicVF", "Billboard", "BillboardYE", "RateYourMusic",
                "RateYourMusicSong", "RateYourMusicList",
                "RateYourMusicList2",
            )
            for chartName in allCharts:
                print(chartName)
                mergedFullChart.update(loadChartFile(fullChartFile, chartName))
                print("There are {0} artists in the full chart data".format(
                    len(mergedFullChart)))
                mergedArtistAlbum.update(
                    loadChartFile(artistAlbumFile, chartName))
                print("There are {0} artists in the artist album data".format(
                    len(mergedArtistAlbum)))
            self.fullChartData = mergedFullChart
            self.artistAlbumData = mergedArtistAlbum

        self.artistData = {}

        self.artistKeyToNameMap = {}
0
 def getMyMovies(self, debug=False):
     """Load and return the contents of ``mymovies.json`` from the data dir.

     Args:
         debug: When true, print how many entries were loaded.

     Raises:
         ValueError: If the file does not exist.
     """
     moviesFile = setFile(self.getDataDir(), "mymovies.json")
     if isFile(moviesFile):
         myMovies = getFile(moviesFile)
         if debug:
             print("Found {0} my movies".format(len(myMovies)))
         return myMovies
     raise ValueError("Cannot access {0}".format(moviesFile))
예제 #15
0
 def parseArtistMetadataFiles(self, debug=False):
     """Rebuild both metadata sets for every mod-value DB file.

     For each mod value in ``range(self.disc.getMaxModVal())``, loads
     ``<modVal>-DB.p`` from the artists DB directory and passes it to
     ``createArtistModValMetadata`` and ``createArtistAlbumModValMetadata``.

     Args:
         debug: Forwarded to both metadata builders.
     """
     dbDir = self.disc.getArtistsDBDir()
     nModVals = self.disc.getMaxModVal()
     for modVal in range(nModVals):
         dbdata = getFile(setFile(dbDir, "{0}-DB.p".format(modVal)))
         self.createArtistModValMetadata(modVal=modVal, db=dbdata, debug=debug)
         self.createArtistAlbumModValMetadata(modVal=modVal, db=dbdata, debug=debug)
예제 #16
0
 def getCombinedMovies(self, debug=False):
     """Load and return ``movies.json`` from the combine results directory.

     Args:
         debug: When true, print how many entries were loaded.

     Raises:
         ValueError: If the file does not exist.
     """
     savename = setFile(self.combine.getResultsDir(), "movies.json")
     if not isFile(savename):
         # Was misspelled "ValueErrro", which raised NameError instead.
         raise ValueError("Cannot access {0}".format(savename))
     combinedMovies = getFile(savename)
     if debug:
         print("Found {0} combined movies".format(len(combinedMovies)))
     return combinedMovies
예제 #17
0
    def assertDBModValExtraData(self, modVal, minPages=1, maxPages=None, allowMulti=False, test=True, clean=True):
        """Download the extra result pages of multi-page artists in one DB file.

        Loads ``<modVal>-DB.p`` and, for every artist whose ``pages.more`` is
        ``True`` and who has at least ``minPages`` pages, walks pages 1 up to
        the page count (capped by ``maxPages``).  Page 1 uses the plain artist
        URL and save name; later pages pass the page number to both helpers.

        Args:
            modVal: Selects the DB file.
            minPages: Artists with fewer pages are skipped.
            maxPages: Optional upper bound on the number of pages visited.
            allowMulti: Unused.
            test: When ``True``, only print what would be downloaded.
            clean: When ``True``, remove an existing save file for each page
                first.  This happens even when ``test`` is ``True``.
        """
        print("assertDBModValExtraData(",modVal,")")
        dbname = setFile(self.disc.getArtistsDBDir(), "{0}-DB.p".format(modVal))
        dbdata = getFile(dbname)

        for artistID, artistData in dbdata.items():
            headerPending = True
            pages = artistData.pages
            if pages.more is not True:
                continue

            npages = pages.pages
            if npages < minPages:
                continue
            if maxPages is not None:
                npages = min([npages, maxPages])
            artistRef = artistData.url.url

            for p in range(1, npages+1):
                if p == 1:
                    url = self.getArtistURL(artistRef)
                    savename = self.dutils.getArtistSavename(artistID)
                else:
                    url = self.getArtistURL(artistRef, p)
                    savename = self.dutils.getArtistSavename(artistID, p)
                print("\t---> {0} / {1}   {2}".format(p, pages.pages, url))

                if clean is True and isFile(savename):
                    print("Removing {0}".format(savename))
                    removeFile(savename)

                if test is True:
                    print("\t\tWill download: {0}".format(url))
                    print("\t\tJust testing... Will not download anything.")
                    continue

                if isFile(savename):
                    continue

                # Print the artist summary once, before the first download.
                if headerPending:
                    print("{0: <20}{1: <10}{2}".format(artistID,pages.tot,artistData.artist.name))
                    headerPending = False

                print("{0: <20}{1: <10}{2}".format(artistID, "{0}/{1}".format(p,pages.pages), url))
                self.dutils.downloadArtistURL(url=url, savename=savename, force=True)
                sleep(3)
예제 #18
0
    def parseBoxOfficeMojo(self, ifile, debug=False):
        """Extract the rows of the main data table from a saved page.

        The data table is the first ``<table>`` whose first ``<tr>`` has at
        least 10 child nodes.  The table's direct children are then iterated:
        the first child supplies the column keys (the text of each link in
        its cells) and each following child with more than one child node is
        a data row.  Parsing stops at the "TOTAL (n MOVIES)" row or at the
        first row that yields no values.

        Args:
            ifile: File loaded with ``getFile`` and parsed with ``getHTML``.
            debug: When true, print how many rows were found.

        Returns:
            list: One list of cell strings per movie; empty when no data
            table is found.

        Raises:
            ValueError: If a row's value count differs from the key count.
        """
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)

        # Only the first row of each table is inspected.
        tbl = None
        for table in bsdata.findAll("table"):
            firstRow = table.find("tr")
            if firstRow is not None and len(firstRow) >= 10:
                tbl = table
                break

        if tbl is None:
            # Previously this fell through to enumerate(None) and crashed.
            if debug:
                print("Found", 0, "movies from", ifile)
            return []

        totalRow = re.compile(r"TOTAL \((\d+) MOVIES\)")
        keys = []
        data = []
        for i, tr in enumerate(tbl):
            if i == 0:
                for td in tr.findAll("td"):
                    for ref in td.findAll("a"):
                        keys.append(ref.string)
                continue

            if len(tr) <= 1:
                continue

            vals = []
            for td in tr.findAll("td"):
                if td.string is None:
                    continue
                if totalRow.search(td.string):
                    break
                vals.append(td.string)

            if len(vals) == 0:
                break
            if len(vals) != len(keys):
                print("Mismatch with keys/data")
                print(len(keys), '\t', keys)
                print(len(vals), '\t', vals)
                # The original raised a plain string, which is a TypeError.
                raise ValueError(
                    "Mismatch with keys/data: {0} keys, {1} values".format(
                        len(keys), len(vals)))
            data.append(vals)

        if debug:
            print("Found", len(data), "movies from", ifile)
        return data
예제 #19
0
    def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
        """Parse saved yearly ranking pages and save films sorted by rank.

        For every matching ``.p`` file in the data directory, each table row
        with exactly 11 cells is read.  The film title is the text of the
        second cell after ``removeTag(..., 'span')``, with the " (<year>)"
        suffix removed.  The rank is the last cell parsed as a float, falling
        back to the second-to-last cell.  The per-year lists, sorted by rank
        in descending order, are written to ``<self.name>.json`` in the
        results directory.

        Args:
            procYear: When given, only files matching this year are processed.
            debug: Unused.

        Raises:
            ValueError: If neither of the last two cells of a row is a number.
        """
        from collections import OrderedDict

        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        movieData = OrderedDict()
        for ifile in sorted(files):
            bsdata = getHTML(getFile(ifile))
            year = getBaseFilename(ifile)

            movies = {}
            for table in bsdata.findAll("table"):
                for tr in table.findAll("tr"):
                    tds = tr.findAll("td")
                    if len(tds) != 11:
                        continue

                    film = removeTag(tds[1], 'span').text
                    film = film.replace(" ({0})".format(year), "")

                    # float() on text raises only ValueError, so catch just that.
                    try:
                        rank = float(tds[-1].text)
                    except ValueError:
                        try:
                            rank = float(tds[-2].text)
                        except ValueError:
                            raise ValueError(tds[-1], tds[-2], tr)

                    movies[film] = rank

            movieData[year] = movies

        yearlyData = {}
        for year in sorted(movieData.keys()):
            yearlyData[year] = sorted(movieData[year].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
            len(yearlyData), savename))
        saveFile(savename, yearlyData)
예제 #20
0
    def parseRottenTomatoesFile(self, ifile, debug=False):
        """Parse a saved Rotten Tomatoes list page into ratings by year.

        The first ``<table class="table">`` is read.  The first row that has
        ``<th>`` cells supplies the header, and each later row supplies one
        movie.  The second cell holds the rating (a percentage) and the third
        holds the title with its release year in parentheses.

        Rows whose title has no "(<digits>)" part are skipped.  Previously
        such a row was stored under the year of the preceding row, or raised
        ``UnboundLocalError`` if it came first.

        Args:
            ifile: File loaded with ``getFile`` and parsed with ``getHTML``.
            debug: When true, print the file name and any skipped titles.

        Returns:
            dict: Year string -> {title -> integer rating}.
        """
        movies = {}

        if debug:
            print("Parsing {0}".format(ifile))
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)
        table = bsdata.find("table", {"class": "table"})
        if not table:
            return movies

        yearPattern = re.compile(r"\((\d+)\)")
        keys = []
        for tr in table.findAll("tr"):
            if len(keys) == 0:
                # Header row: fall back to the joined <span> texts when the
                # cell has no single string.
                for th in tr.findAll("th"):
                    key = th.string
                    if key is None:
                        key = " ".join(
                            [x.string for x in th.findAll("span")])
                    keys.append(key)
                continue

            line = []
            for i, td in enumerate(tr.findAll("td")):
                # NOTE(review): for a column index above 3, or a second cell
                # with no non-empty <span>, ``val`` keeps the previous cell's
                # value; confirm the table always has four plain columns.
                if i == 0 or i == 3:
                    val = td.string
                elif i == 1:
                    for span in td.findAll("span"):
                        if span.string:
                            val = span.string
                            break
                elif i == 2:
                    val = td.find("a").string

                val = val.strip()
                line.append(val)

            movie = line[2]
            rating = int(line[1].replace("%", ""))

            match = yearPattern.search(movie)
            if match is None:
                if debug:
                    print("No year found in {0}".format(movie))
                continue
            movie = movie.replace(match.group(), "").strip()
            year = match.group(1)

            movies.setdefault(year, {})[movie] = rating

        return movies
예제 #21
0
 def searchBoxOfficeMojo(self, movie, debug=False):
     """Print, per year, the saved entries whose title best matches ``movie``.

     Loads ``<self.name>.json`` from the results directory, which maps a year
     to ``(name, value)`` pairs.  For each year, ``findNearest`` is asked for
     one match with a cutoff of 0.9, and the matching pairs are printed.

     Args:
         movie: Title to look for.
         debug: Unused.
     """
     resultsFile = setFile(self.getResultsDir(), "{0}.json".format(self.name))
     data = getFile(resultsFile)
     print("Nearest matches for {0}".format(movie))
     for year, yearlyMovies in data.items():
         titles = [entry[0] for entry in yearlyMovies]
         nearest = findNearest(movie, titles, num=1, cutoff=0.9)
         if len(nearest) == 0:
             continue
         values = [(name, value) for name, value in yearlyMovies
                   if name in nearest]
         print("{0: <6}{1}".format(year, values))
예제 #22
0
 def parseDownloadedFiles(self):
     """Re-save downloaded discography pages under artist-ID based names.

     Looks in the ``data`` sub-directory of the artists directory for
     ``.htm`` files matching "Discography and Albums".  Each file's artist ID
     comes from ``self.getData`` and its contents are written to the save
     name returned by ``self.getArtistSavename``.
     """
     dataDir = setDir(self.disc.getArtistsDir(), "data")
     downloaded = findPatternExt(dataDir,
                                 pattern="Discography and Albums",
                                 ext=".htm")
     for rawFile in downloaded:
         htmldata = getFile(rawFile)
         artistID = self.getData(rawFile).ID.ID
         target = self.getArtistSavename(artistID)
         saveFile(idata=htmldata, ifile=target, debug=True)
예제 #23
0
 def parseDownloadedFiles(self, previousDays=None, force=False):
     """Call ``getArtistRawHTMLFiles`` and return ``None``.

     NOTE(review): as written, the method returns right after the
     ``getArtistRawHTMLFiles`` call, so the re-save loop below is never
     executed. The ``previousDays`` and ``force`` arguments are also not
     forwarded; literal ``None`` / ``False`` are passed instead. Confirm
     whether the early ``return`` is a temporary switch-off.

     Args:
         previousDays: Currently ignored.
         force: Currently ignored.
     """
     artistDir = self.disc.getArtistsDir()
     # The returned list is not used before the early return.
     files = self.getArtistRawHTMLFiles(previousDays=None, force=False)
     return
     # --- Unreachable from here on (kept as in the original) ---
     # Would re-save each "Rate Your Music" .html file in <artistDir>/data
     # under the save name derived from its artist ID.
     dataDir = setDir(artistDir, "data")
     files = findPatternExt(dataDir, pattern="Rate Your Music", ext=".html")
     for ifile in files:
         htmldata = getFile(ifile)
         retval = self.artist.getData(ifile)
         artistID = retval.ID.ID
         savename = self.dutils.getArtistSavename(artistID)
         saveFile(idata=htmldata, ifile=savename, debug=False)
예제 #24
0
    def moveMyMatchedMusicAlbums(self, show=False):
        """Move matched album directories into ``Match/<media type>/``.

        The map loaded from ``self.moveFilename`` is nested as
        db -> artist -> my album name -> ``{"Dir", "Album", "Ratio"}``, where
        ``"Album"`` holds ``"Name"``, ``"Code"`` and ``"MediaType"``.  For
        each album, the source ``<Dir>/<my album name>`` is moved to
        ``<Dir>/Match/<media type>/<matched dir name>``.  Albums whose source
        is missing or whose destination already exists are skipped.

        Args:
            show: When ``True``, only print the planned source and destination
                names.  The ``Match`` and media-type directories are still
                created.
        """
        rename = True
        albumsToMove = getFile(ifile=self.moveFilename)
        print("Found {0} music <-> discogs albums maps".format(
            len(albumsToMove)))

        for db, dbValues in albumsToMove.items():
            if dbValues is None:
                continue
            for artistName, artistAlbums in dbValues.items():
                print("==>", artistName)
                for myAlbumName, albumVals in artistAlbums.items():
                    dirval = albumVals["Dir"]
                    albumVal = albumVals["Album"]
                    ratio = albumVals["Ratio"]  # looked up but not used

                    dbAlbumName = albumVal["Name"]
                    dbAlbumCode = albumVal["Code"]
                    mediaType = albumVal["MediaType"]

                    matchedDir = setDir(dirval, "Match")
                    mkDir(matchedDir)

                    srcName = myAlbumName
                    srcDir = setDir(dirval, srcName)
                    if not isDir(srcDir):
                        print("{0} does not exist".format(srcDir))
                        continue

                    mediaDir = setDir(matchedDir, self.discConv(mediaType))
                    mkDir(mediaDir)

                    # With rename on, the destination uses the DB's album name.
                    baseName = (self.discConv(dbAlbumName)
                                if rename is True else myAlbumName)
                    dstName = self.getMatchedDirName(baseName, dbAlbumCode, db)

                    if show is True:
                        print('\t{0}'.format(mediaDir))
                        print("\t\t[{0}]".format(srcName))
                        print("\t\t[{0}]".format(dstName))
                        continue

                    dstDir = setDir(mediaDir, dstName)
                    if isDir(dstDir):
                        print("{0} already exists".format(dstDir))
                        continue

                    print("\tMoving {0}  --->  {1}".format(srcDir, dstDir))
                    moveDir(srcDir, dstDir, debug=True)
예제 #25
0
    def parseFilms101YearlyData(self, ifile, debug=False):
        """Extract the movie titles from one Films101 yearly HTML file.

        The page is read as pairs of tables: a ``lsthdg`` table whose cells
        are the column headers and a ``lstdta`` table whose rows are the
        movies. Tables are paired by position.

        Args:
            ifile: Path of the saved HTML file.
            debug: When true, print the file name and progress counts.

        Returns:
            list: The ``TITLE`` cell of every data row, in page order.

        Raises:
            ValueError: If there are fewer header tables than data tables,
                if a row's cell count differs from its header count, or if
                a row has no ``TITLE`` column.
        """
        if debug:
            print(ifile)
        htmldata = getFile(ifile)
        bsdata = getHTML(htmldata)

        headertables = bsdata.findAll("table", {"class": "lsthdg"})
        datatables = bsdata.findAll("table", {"class": "lstdta"})
        if len(headertables) < len(datatables):
            print(headertables)
            raise ValueError("Found {0} headers and {1} data tables".format(len(headertables), len(datatables)))

        if debug:
            print("Found {0} tables".format(len(datatables)))

        movies = []
        # Total rows seen over all tables. Starting at zero keeps the debug
        # summary below valid even when the page has no data tables.
        expect = 0
        # zip stops at the shorter list, i.e. at the data tables, because the
        # check above guarantees there are at least as many header tables.
        for headertable, datatable in zip(headertables, datatables):
            headers = [td.text.strip() for td in headertable.findAll("td")]

            trs = datatable.findAll("tr")
            expect += len(trs)
            for tr in trs:
                tds = [td.text for td in tr.findAll("td")]
                if len(tds) != len(headers):
                    print(headers)
                    print(tds)
                    # This used to be a deliberate 1/0 crash; raise the same
                    # error type as the other parse failures instead.
                    raise ValueError("Found {0} cells for {1} headers".format(len(tds), len(headers)))

                mdata = dict(zip(headers, tds))
                try:
                    movie = mdata['TITLE']
                except KeyError as err:
                    raise ValueError("Could not get movie name from TITLE key! {0}".format(mdata)) from err

                movies.append(movie)

        if debug:
            print("Found {0}/{1} movies".format(len(movies), expect))

        return movies
예제 #26
0
파일: dbBase.py 프로젝트: tgadf/dbdata
    def getDBData(self, dbname, prefix, returnName=False, debug=False):
        """Load (or just locate) the ``<prefix><dbname>.p`` file in the DB dir.

        Args:
            dbname: Database name, used as the second part of the file name.
            prefix: Text placed in front of ``dbname`` in the file name.
            returnName: When exactly ``True``, return the file path without
                checking that it exists or reading it.
            debug: Passed through to ``getFile``.

        Returns:
            The file path when ``returnName`` is ``True``, otherwise whatever
            ``getFile`` returns for that path.

        Raises:
            ValueError: If the data is requested and the file does not exist.
        """
        dbDir = self.getDiscogDBDir()
        fileName = "{0}{1}.p".format(prefix, dbname)
        savename = setFile(dbDir, fileName)

        if self.debug is True:
            print("Data stored in {0}".format(savename))

        if returnName is True:
            return savename

        if isFile(savename):
            if self.debug:
                print("Returning data from {0}".format(savename))
            return getFile(savename, debug=debug)

        raise ValueError("Could not find {0}".format(savename))
예제 #27
0
    def getData(self, inputdata):
        """Turn the input into parsed HTML, store it, and run the parser.

        Args:
            inputdata: One of
                * a string naming an existing file: the file is read with
                  ``getFile`` (retried with ``version=2`` if the first read
                  or parse fails) and passed to ``getHTML``;
                * any other string: passed directly to ``getHTML``;
                * an object accepted by ``isBS4``: used as is.

        Returns:
            The result of ``self.parse()``.

        Raises:
            ValueError: If the file or string cannot be turned into HTML, or
                if the input is of an unsupported type. The underlying error
                is attached as the cause.
        """
        if isinstance(inputdata, str):
            if isFile(inputdata):
                try:
                    bsdata = getHTML(getFile(inputdata))
                except Exception:
                    # First read failed; retry with the alternate file version.
                    try:
                        bsdata = getHTML(getFile(inputdata, version=2))
                    except Exception as err:
                        raise ValueError("Cannot read artist file: {0}".format(inputdata)) from err
            else:
                try:
                    bsdata = getHTML(inputdata)
                except Exception as err:
                    raise ValueError("Not sure about string input: {0} . It is not a file".format(inputdata)) from err
        elif isBS4(inputdata):
            bsdata = inputdata
        else:
            raise ValueError("Not sure about input type: {0}".format(type(inputdata)))

        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate instead of being
        # reported as a ValueError.
        self.bsdata = bsdata

        return self.parse()
예제 #28
0
 def getData(self):
     """Load every movie source's results file and collect the years covered.

     For each key in ``self.sources`` the ``<name>.json`` file in that
     source's results directory is loaded into ``self.movieSourceData[key]``
     and its keys are stored in ``self.movieSourceYears[key]``.
     ``self.years`` is then set to the sorted, de-duplicated union of those
     keys.

     Raises:
         ValueError: If a source's results file does not exist.
     """
     years = set()
     for key in self.sources:
         source = self.movieSource[key]
         resultsDir = source.getResultsDir()
         resultsName = source.name
         filename = setFile(resultsDir, "{0}.json".format(resultsName))
         if not isFile(filename):
             raise ValueError("There is not results file: {0}".format(filename))

         self.movieSourceData[key] = getFile(filename)
         self.movieSourceYears[key] = list(self.movieSourceData[key].keys())
         print("Found {0} Years of {1} Movies".format(len(self.movieSourceYears[key]), key))
         years.update(self.movieSourceYears[key])

     self.years = sorted(years)
     if not self.years:
         # With no sources (or only empty result files) there is no range to
         # report; min()/max() on the empty list used to crash here.
         print("Found No Data")
         return
     # The list is sorted, so its ends are the minimum and maximum.
     print("Found Data Between {0} and {1}".format(self.years[0], self.years[-1]))
예제 #29
0
    def searchForArtist(self, artist):
        """Download the search page for an artist, record the term, parse it.

        Args:
            artist: The search term.

        Returns:
            ``False`` when the download does not answer with status 200;
            otherwise nothing (``None``).

        Raises:
            ValueError: If no search URL could be built for the artist.
        """
        banner = "\n\n===================== Searching For {0} =====================".format(artist)
        print(banner)

        searchURL = self.getSearchArtistURL(artist)
        if searchURL is None:
            raise ValueError("URL is None!")

        # Fetch the search page; anything but a 200 aborts the search.
        payload, status = self.downloadURL(searchURL)
        if status != 200:
            print("Error downloading {0}".format(searchURL))
            return False

        # Append the term to the stored list of searched terms.
        searched = getFile(self.knownFile)
        print("  Found {0} previously searched for terms.".format(len(searched)))
        searched.append(artist)
        saveFile(idata=searched, ifile=self.knownFile)

        self.parseSearchArtist(artist, payload)
예제 #30
0
    def downloadTeamStatisticsData(self, debug=False):
        """Download team statistics for every team of the stored seasons.

        Each ``.p`` file in the season results directory is loaded and, for
        the seasons that pass the year filter below,
        ``downloadTeamStatisticsDataByYear`` is called once per team.

        Args:
            debug: Passed through to ``downloadTeamStatisticsDataByYear``.
        """
        resultsDir = self.getSeasonResultsDir()
        files = findExt(resultsDir, ext=".p", debug=False)

        # Fixed three second pause before any season file is processed.
        sleep(3)

        for ifile in files:
            seasonData = getFile(ifile)
            year = seasonData.getYear()
            # NOTE(review): gamesDir is not used in the code visible here.
            gamesDir = self.getYearlyGamesDir(year)

            # NOTE(review): hard-coded filter, only the 2014 season is
            # processed; presumably a leftover restriction, confirm.
            if year != 2014:
                continue

            teams = seasonData.teams
            for teamID, teamData in teams.items():
                name = teamData.teamName
                self.downloadTeamStatisticsDataByYear(teamID, name, year,
                                                      debug)