예제 #1
0
파일: SAG.py 프로젝트: tgadf/movies
    def parseSAGFilmData(self, table, category, debug=False):
        """Collect film titles from one awards table, grouped by year.

        The ``th`` cells of ``table`` name the columns. Every row after the
        first must contain either three ``td`` cells (a year cell followed by
        two data cells) or two ``td`` cells (a continuation row that reuses
        the most recently seen year).

        Args:
            table: Parsed table element; only ``findAll`` on the table and
                its rows, and ``.text`` on the cells, are used.
            category: Key under which the titles are stored for each year.
            debug: When true, print one line per film that is recorded.

        Returns:
            dict: ``{year: {category: [film, ...]}}``. ``year`` is the
            stripped text of the year cell, or ``None`` for rows that come
            before any year cell.

        Raises:
            ValueError: If a year cell does not hold an integer, a row has an
                unexpected number of cells, or there is no "Film" column.
        """
        filmdata = {}

        headers = [
            th.text.replace("\n", "")
            for th in table.findAll("th")
            if th is not None
        ]

        year = None
        # The first row is skipped; the remaining rows carry the data.
        for tr in table.findAll("tr")[1:]:
            tds = tr.findAll("td")

            if len(tds) == 3:
                try:
                    year = removeTag(tds[0], 'a').text
                    year = year.replace("\n", "").strip()
                    int(year)  # validation only; the year is kept as text
                except Exception as err:
                    raise ValueError(
                        "Could not find year in {0}".format(tds[0])) from err
                tds = tds[1:]

            if len(tds) != 2:
                raise ValueError(
                    "Not sure what to do with this row: {0}".format(tds))

            # Put the current year in front so the values line up with the
            # header cells.
            values = [year] + [td.text.replace("\n", "").strip() for td in tds]
            row = dict(zip(headers, values))

            try:
                movie = row["Film"]
            except KeyError as err:
                raise ValueError(
                    "Cannot find movie in {0}".format(row)) from err

            # Drop a trailing " ‡" or " †" marker from the title.
            if movie.endswith((" ‡", " †")):
                movie = movie[:-2]

            filmdata.setdefault(year, {}).setdefault(category, []).append(movie)

            if debug:
                print("{0: <10}{1: <75}{2}".format(year, category, movie))

        return filmdata
예제 #2
0
    def getName(self):
        """Read the artist name and native name from the page heading.

        The heading is the ``h1`` with class ``artist_name_hdr`` found in
        ``self.bsdata``. If it contains a ``span``, the span text is the name
        and the native name is the text of whatever ``removeTag`` returns for
        the heading and that span. Without a span, both names are the
        heading text. After ``fixName``, a name ending in "]" is cut at its
        first " [".

        Returns:
            artistDBNameClass: built with ``err="No H1"`` when the heading is
            missing, with ``err="Fix"`` when the name is empty, and otherwise
            with the name, the native name and ``err=None``.
        """
        heading = self.bsdata.find("h1", {"class": "artist_name_hdr"})
        if heading is None:
            return artistDBNameClass(err="No H1")

        inner = heading.find("span")
        if inner is not None:
            name = inner.text.strip()
            heading = removeTag(heading, inner)
            native = heading.text.strip()
        else:
            name = heading.text.strip()
            native = name

        # Nothing usable in the heading: report it instead of normalising.
        if len(name) == 0:
            return artistDBNameClass(name=name, err="Fix")

        name = fixName(name)
        native = fixName(native)

        if name.endswith("]"):
            name = name.split(" [")[0].strip()
        if native.endswith("]"):
            native = native.split(" [")[0].strip()

        return artistDBNameClass(name=name, native=native, err=None)
예제 #3
0
    def getAlsoKnownAs(self, tag):
        """Build the list of alternative names held by ``tag``.

        If the element returned by ``tag.getTag()`` contains a
        ``span.rendered_text``, every anchor in that span becomes an
        ``artistDBLinkClass`` entry, and the span text left after
        ``removeTag`` has been applied for each anchor is split on commas
        into ``artistDBTextClass`` entries. Without such a span, the anchors
        of the element are used; if there are none, the element's own text
        becomes a single ``artistDBTextClass`` entry.

        Args:
            tag: Wrapper exposing ``getTag()``, or ``None``.

        Returns:
            ``None`` when ``tag`` is ``None``; otherwise a list of link and
            text entries (possibly empty).
        """
        if tag is None:
            return None

        # Example of the wrapped markup:
        # <div class="info_content"><span class="rendered_text">Dwight David Turner [birth name], <a class="artist" href="/artist/dwight_david" title="[Artist864564]">Dwight David</a>, Spider Turner</span></div>
        span = tag.getTag().find("span", {"class": "rendered_text"})
        retval = []

        if span is not None:
            for ref in span.findAll("a"):
                retval.append(artistDBLinkClass(ref))
                span = removeTag(span, ref)

            for piece in span.text.split(","):
                retval.append(artistDBTextClass(piece.strip()))
            return retval

        refs = tag.getTag().findAll("a")
        if len(refs) == 0:
            # The element is searched with find()/findAll() above, so it is a
            # parsed tag rather than a string: take its text before
            # stripping. Calling .strip() on the tag itself always failed and
            # the failure was silently swallowed.
            try:
                retval.append(artistDBTextClass(tag.getTag().text.strip()))
            except Exception:
                # Best effort: an element without usable text adds nothing.
                pass
        else:
            for ref in refs:
                retval.append(artistDBLinkClass(ref))
        return retval
예제 #4
0
    def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
        """Parse the stored yearly pages into ranked film lists and save them.

        Every ``.p`` file found in ``self.getDataDir()`` (optionally only
        those matching ``procYear``) is loaded with ``getFile`` and parsed
        with ``getHTML``. The year is the base filename. In every table, rows
        with exactly eleven ``td`` cells contribute one film: the title comes
        from the second cell (after ``removeTag(..., 'span')``, with a
        trailing " (<year>)" removed) and the rank from the last cell, or
        from the second-to-last cell when the last one is not a number.

        The films of each year are sorted by rank, highest first, the top
        five are printed, and the full result is written with ``saveFile``
        to "<self.name>.json" in ``self.getResultsDir()``.

        Args:
            procYear: Restrict processing to files matching this value;
                ``None`` processes every file.
            debug: Currently unused.

        Raises:
            ValueError: If neither of the last two cells of an eleven-cell
                row can be read as a float.
        """
        from collections import OrderedDict

        outdir = self.getDataDir()
        if procYear is None:
            files = findExt(outdir, ext=".p")
        else:
            files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

        movieData = OrderedDict()
        for ifile in sorted(files):
            htmldata = getFile(ifile)
            bsdata = getHTML(htmldata)
            year = getBaseFilename(ifile)
            yearSuffix = " ({0})".format(year)

            movies = {}
            for table in bsdata.findAll("table"):
                for tr in table.findAll("tr"):
                    tds = tr.findAll("td")
                    if len(tds) != 11:
                        continue

                    film = removeTag(tds[1], 'span').text
                    film = film.replace(yearSuffix, "")

                    # Prefer the last cell; fall back to the one before it.
                    rank = None
                    for cell in (tds[-1], tds[-2]):
                        try:
                            rank = float(cell.text)
                        except (TypeError, ValueError):
                            continue
                        break
                    if rank is None:
                        raise ValueError(tds[-1], tds[-2], tr)

                    movies[film] = rank

            movieData[year] = movies

        yearlyData = {}
        for year in sorted(movieData):
            yearlyData[year] = sorted(movieData[year].items(),
                                      key=operator.itemgetter(1),
                                      reverse=True)
            print("---->", year,
                  " (Top 5/{0} Movies) <----".format(len(yearlyData[year])))
            for item in yearlyData[year][:5]:
                print(item)
            print('\n')

        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
            len(yearlyData), savename))
        saveFile(savename, yearlyData)
예제 #5
0
파일: canada.py 프로젝트: tgadf/movies
    def parseCanadaFilmData(self, table, category, debug=False):
        """Collect film titles from one awards table, grouped by year.

        The ``th`` cells of ``table`` name the columns. Rows after the first
        are handled by their number of ``td`` cells: a single-cell row is a
        year marker and sets the current year, while any other row is a data
        row whose cell texts, preceded by the current year, are matched
        against the headers.

        A single-cell row that cannot be read as an integer year is skipped
        and leaves the current year unchanged. Data rows with an empty
        "Film" value are skipped as well.

        Args:
            table: Parsed table element; only ``findAll`` on the table and
                its rows, and ``.text`` on the cells, are used.
            category: Key under which the titles are stored for each year.
            debug: When true, print one line per film that is recorded.

        Returns:
            dict: ``{year: {category: [film, ...]}}`` where ``year`` is an
            ``int``, or ``None`` for rows that come before any year marker.

        Raises:
            ValueError: If a data row has no "Film" column.
        """
        filmdata = {}

        headers = [
            th.text.replace("\n", "")
            for th in table.findAll("th")
            if th is not None
        ]

        year = None
        for tr in table.findAll("tr")[1:]:
            tds = tr.findAll("td")

            if len(tds) == 1:
                # Year marker row. Assign the year only after it parsed, so
                # a non-year cell cannot leave raw text behind as the key
                # for the films that follow.
                try:
                    text = removeTag(tds[0], 'a').text
                    text = text.replace("Award presented in 1975", "")
                    year = int(text.strip())
                except Exception:
                    pass
                continue

            # Put the current year in front so the values line up with the
            # header cells.
            values = [year] + [td.text.replace("\n", "").strip() for td in tds]
            row = dict(zip(headers, values))

            try:
                movie = row["Film"]
            except KeyError as err:
                raise ValueError(
                    "Cannot find movie in {0}".format(row)) from err

            if len(movie) == 0:
                continue

            filmdata.setdefault(year, {}).setdefault(category, []).append(movie)

            if debug:
                print("{0: <10}{1: <20}{2}".format(year, category, movie))

        return filmdata
예제 #6
0
    def getProfile(self):
        """Assemble the artist profile from ``self.bsdata``.

        Four parts are collected:

        * general data: a "Metadata" mapping built from the ``dt``/``dd``
          pairs inside ``div.metadata-and-wiki-row``, and a "Wiki" entry
          holding the text and the de-duplicated links of every
          ``div.wiki-block`` inside a ``div.wiki-column``;
        * extra data: links to similar artists, taken from the
          ``ol.catalogue-overview-similar-artists-full-width`` list or, when
          that is missing, from ``section.artist-similar-sidebar``;
        * tags: the anchors of ``section.catalogue-tags``;
        * external links: the anchors of ``section.external-links-section``.

        A part with nothing to report is passed on as ``None``.

        Returns:
            artistDBProfileClass: built from the four parts above.
        """

        def linkList(refs):
            """Wrap each anchor in artistDBLinkClass; None unless refs is a non-empty list."""
            if isinstance(refs, list) and len(refs) > 0:
                return [artistDBLinkClass(ref) for ref in refs]
            return None

        generalData = {}

        ##
        ## General
        ##
        metadata = self.bsdata.find("div", {"class": "metadata-and-wiki-row"})
        if metadata is not None:
            for dl in metadata.findAll("dl"):
                attrKeys = [dt.text for dt in dl.findAll("dt")]
                attrVals = []
                for dd in dl.findAll("dd"):
                    refs = dd.findAll("a")
                    if len(refs) == 0:
                        attrVals.append([artistDBTextClass(dd)])
                    else:
                        attrVals.append(
                            [artistDBLinkClass(ref) for ref in refs])
                # NOTE(review): every <dl> replaces "Metadata", so only the
                # last one is kept -- confirm that this is intended.
                generalData["Metadata"] = dict(zip(attrKeys, attrVals))

        for wikicolumn in self.bsdata.findAll("div", {"class": "wiki-column"}):
            for wikiblock in wikicolumn.findAll("div", {"class": "wiki-block"}):
                refs = wikiblock.findAll("a")
                # Build the links before the anchors are removed from the
                # block.
                links = linkList(refs)
                for ref in refs:
                    removeTag(wikiblock, ref)
                text = artistDBTextClass(wikiblock)

                wiki = generalData.setdefault("Wiki", {"Text": [], "Refs": []})
                wiki["Text"].append(text)
                # Add the block's links once; the (href, text) de-duplication
                # below yields the same result as adding them once per
                # anchor.
                if links:
                    wiki["Refs"] += links

        if generalData.get("Wiki") is not None:
            keep = {(ref.href, ref.text): ref
                    for ref in generalData["Wiki"]["Refs"]}
            generalData["Wiki"]["Refs"] = list(keep.values())

        ##
        ## Similar artists
        ##
        similarData = self.bsdata.find(
            "ol", {"class": "catalogue-overview-similar-artists-full-width"})
        if similarData is None:
            similarData = self.bsdata.find(
                "section", {"class": "artist-similar-sidebar"})
        extraData = None
        if similarData is not None:
            refs = [li.find("a", {"class": "link-block-target"})
                    for li in similarData.findAll("li")]
            extraData = linkList(refs)

        ##
        ## Tags
        ##
        tags = self.bsdata.find("section", {"class": "catalogue-tags"})
        tagsData = linkList(tags.findAll("a") if tags is not None else None)

        ##
        ## External
        ##
        external = self.bsdata.find("section",
                                    {"class": "external-links-section"})
        externalData = linkList(
            external.findAll("a") if external is not None else None)

        generalData = generalData if len(generalData) > 0 else None

        apc = artistDBProfileClass(general=generalData,
                                   tags=tagsData,
                                   extra=extraData,
                                   external=externalData)
        return apc