def parseSAGFilmData(self, table, category, debug=False):
    """Collect the film titles listed in an award table, grouped by year.

    The first ``<tr>`` is skipped as the header row. Every other row must
    have either three ``<td>`` cells (year, then two data cells) or two
    ``<td>`` cells (data only; the most recently seen year is reused).

    Args:
        table: Parsed table node exposing ``findAll`` (BeautifulSoup style).
        category: Key under which the films are filed for each year.
        debug: When true, print one line per film that was recorded.

    Returns:
        ``{year: {category: [film, ...]}}``. ``year`` is the stripped text of
        the year cell, or ``None`` for rows seen before any year cell.

    Raises:
        ValueError: If a year cell is not an integer, a row has an unexpected
            number of cells, or the table has no ``"Film"`` column.
    """
    headers = [th.text.replace("\n", "")
               for th in table.findAll("th") if th is not None]

    filmdata = {}
    year = None
    for tr in table.findAll("tr")[1:]:
        tds = tr.findAll("td")

        if len(tds) == 3:
            # A three-cell row opens a new year; the first cell holds it.
            # removeTag is a project helper -- presumably it strips the <a>
            # children from the cell (TODO confirm).
            try:
                year = removeTag(tds[0], 'a').text.replace("\n", "").strip()
                int(year)  # validation only: the year is kept as text
            except Exception as err:
                raise ValueError(
                    "Could not find year in {0}".format(tds[0])) from err
            tds = tds[1:]

        if len(tds) != 2:
            raise ValueError(
                "Not sure what to do with this row: {0}".format(tds))

        # Prepend the year so the cells line up with the header names.
        cells = [year] + [td.text.replace("\n", "").strip() for td in tds]
        row = dict(zip(headers, cells))

        try:
            movie = row["Film"]
        except KeyError as err:
            raise ValueError("Cannot find movie in {0}".format(row)) from err

        # Drop a trailing dagger / double-dagger footnote marker.
        if movie.endswith((" ‡", " †")):
            movie = movie[:-2]

        filmdata.setdefault(year, {}).setdefault(category, []).append(movie)

        if debug:
            print("{0: <10}{1: <75}{2}".format(year, category, movie))

    return filmdata
def getName(self):
    """Extract the artist name from the page's ``artist_name_hdr`` heading.

    Looks up the ``<h1 class="artist_name_hdr">`` element in ``self.bsdata``.
    When the heading contains a ``<span>``, the span text is used as the name
    and the heading text left after ``removeTag(heading, span)`` is used as
    the native name; otherwise both are the heading text. Both values go
    through ``fixName`` and have a trailing `` [...]`` suffix cut off.

    Returns:
        An ``artistDBNameClass``: ``err="No H1"`` when the heading is missing,
        ``err="Fix"`` when the extracted name is empty, ``err=None`` otherwise.
    """
    header = self.bsdata.find("h1", {"class": "artist_name_hdr"})
    if header is None:
        return artistDBNameClass(err="No H1")

    span = header.find("span")
    if span is not None:
        name = span.text.strip()
        # removeTag is a project helper; presumably it returns the heading
        # without the span (TODO confirm).
        header = removeTag(header, span)
        native = header.text.strip()
    else:
        name = header.text.strip()
        native = name

    if len(name) == 0:
        return artistDBNameClass(name=name, err="Fix")

    name = fixName(name)
    native = fixName(native)

    # Cut a trailing bracketed qualifier such as "Name [something]".
    if name.endswith("]"):
        name = name.split(" [")[0].strip()
    if native.endswith("]"):
        native = native.split(" [")[0].strip()

    return artistDBNameClass(name=name, native=native, err=None)
def getAlsoKnownAs(self, tag):
    """Parse an "also known as" entry into link and text records.

    Example of the wrapped markup this handles::

        {'tag': <div class="info_content"><span class="rendered_text">
                 Dwight David Turner [birth name],
                 <a class="artist" href="/artist/dwight_david"
                    title="[Artist864564]">Dwight David</a>,
                 Spider Turner</span></div>}

    Args:
        tag: Wrapper object whose ``getTag()`` returns the parsed node, or
            ``None``.

    Returns:
        ``None`` when ``tag`` is ``None``; otherwise a list holding an
        ``artistDBLinkClass`` per ``<a>`` and an ``artistDBTextClass`` per
        plain-text alias.
    """
    if tag is None:
        return None

    # NOTE(review): getTag() is assumed to be a plain accessor, so it is
    # called once here instead of once per branch -- confirm.
    node = tag.getTag()
    retval = []

    span = node.find("span", {"class": "rendered_text"})
    if span is not None:
        # Record every link, removing it from the span so that only the
        # plain-text aliases are left for the comma split below.
        for ref in span.findAll("a"):
            retval.append(artistDBLinkClass(ref))
            span = removeTag(span, ref)
        for piece in span.text.split(","):
            retval.append(artistDBTextClass(piece.strip()))
        return retval

    refs = node.findAll("a")
    if len(refs) > 0:
        retval.extend(artistDBLinkClass(ref) for ref in refs)
        return retval

    # No span and no links: fall back to the node's own text. The previous
    # code called .strip() on the node itself, which always failed and was
    # silently swallowed, so this text was never recorded.
    try:
        text = node.text.strip()
    except AttributeError:
        text = ""
    if text:
        retval.append(artistDBTextClass(text))
    return retval
def parseUltimateMovieRankingsYearlyData(self, procYear=None, debug=False):
    """Parse the saved yearly pages, rank the films per year and save them.

    Every ``.p`` file in ``self.getDataDir()`` (optionally only those whose
    name matches ``procYear``) is loaded and parsed as HTML. From each table
    row with exactly eleven ``<td>`` cells, the film title is read from the
    second cell and its score from the last cell, falling back to the
    second-to-last cell when the last one is not a number. The per-year
    rankings are printed (top five) and written to
    ``<results dir>/<self.name>.json``.

    Args:
        procYear: Restrict processing to files matching this value; ``None``
            processes every file.
        debug: Unused; kept for interface compatibility.

    Raises:
        ValueError: If neither of the last two cells of a row is a number.
    """
    outdir = self.getDataDir()
    if procYear is None:
        files = findExt(outdir, ext=".p")
    else:
        files = findPatternExt(outdir, pattern=str(procYear), ext=".p")

    movieData = {}
    for ifile in sorted(files):
        bsdata = getHTML(getFile(ifile))
        # The file's base name doubles as the year key.
        year = getBaseFilename(ifile)
        yearSuffix = " ({0})".format(year)

        movies = {}
        for table in bsdata.findAll("table"):
            for tr in table.findAll("tr"):
                tds = tr.findAll("td")
                if len(tds) != 11:
                    continue

                # removeTag is a project helper; presumably it strips the
                # <span> children from the title cell (TODO confirm).
                film = removeTag(tds[1], 'span').text.replace(yearSuffix, "")

                try:
                    rank = float(tds[-1].text)
                except (TypeError, ValueError):
                    try:
                        rank = float(tds[-2].text)
                    except (TypeError, ValueError) as err:
                        raise ValueError(tds[-1], tds[-2], tr) from err
                movies[film] = rank

        movieData[year] = movies

    yearlyData = {}
    for year in sorted(movieData):
        ranked = sorted(movieData[year].items(),
                        key=operator.itemgetter(1), reverse=True)
        yearlyData[year] = ranked
        print("---->", year, " (Top 5/{0} Movies) <----".format(len(ranked)))
        for item in ranked[:5]:
            print(item)
        print('\n')

    savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
    print("Saving {0} Years of Ultimate Movie Rankings data to {1}".format(
        len(yearlyData), savename))
    saveFile(savename, yearlyData)
def parseCanadaFilmData(self, table, category, debug=False):
    """Collect the film titles listed in an award table, grouped by year.

    The first ``<tr>`` is skipped as the header row. A row with a single
    ``<td>`` is treated as a year marker; every other row is a data row whose
    cells are matched to the header names after the current year is
    prepended. Rows with an empty ``"Film"`` value are skipped.

    Args:
        table: Parsed table node exposing ``findAll`` (BeautifulSoup style).
        category: Key under which the films are filed for each year.
        debug: When true, print one line per film that was recorded.

    Returns:
        ``{year: {category: [film, ...]}}``. ``year`` is an ``int``, or
        ``None`` for rows seen before any year marker.

    Raises:
        ValueError: If a data row yields no ``"Film"`` column.
    """
    headers = [th.text.replace("\n", "")
               for th in table.findAll("th") if th is not None]

    filmdata = {}
    year = None
    for tr in table.findAll("tr")[1:]:
        tds = tr.findAll("td")

        if len(tds) == 1:
            # Single-cell rows carry the year. Parse into a temporary so a
            # cell that is not a year is skipped without clobbering the
            # current year (it used to be replaced by the raw cell text).
            try:
                text = removeTag(tds[0], 'a').text
                text = text.replace("Award presented in 1975", "")
                parsed = int(text.strip())
            except Exception:
                # Best effort: a single cell that is not a year is ignored.
                continue
            year = parsed
            continue

        # Prepend the year so the cells line up with the header names.
        cells = [year] + [td.text.replace("\n", "").strip() for td in tds]
        row = dict(zip(headers, cells))

        try:
            movie = row["Film"]
        except KeyError as err:
            raise ValueError("Cannot find movie in {0}".format(row)) from err

        if len(movie) == 0:
            continue

        filmdata.setdefault(year, {}).setdefault(category, []).append(movie)

        if debug:
            print("{0: <10}{1: <20}{2}".format(year, category, movie))

    return filmdata
def getProfile(self):
    """Build the artist profile from the parsed page in ``self.bsdata``.

    Gathers four groups of data and wraps them in ``artistDBProfileClass``:

    * general: ``{"Metadata": {...}, "Wiki": {"Text": [...], "Refs": [...]}}``
      taken from the ``metadata-and-wiki-row`` div and the ``wiki-column``
      divs (``None`` when neither produced anything);
    * extra: links to similar artists;
    * tags: links found in the ``catalogue-tags`` section;
    * external: links found in the ``external-links-section`` section.

    Each link group is ``None`` when its section is missing or has no links.

    Returns:
        An ``artistDBProfileClass`` instance.
    """
    def _links(refs):
        # Wrap each anchor in artistDBLinkClass; None when there is nothing.
        if isinstance(refs, list) and len(refs) > 0:
            return [artistDBLinkClass(ref) for ref in refs]
        return None

    generalData = {}

    ##
    ## General
    ##
    metadata = self.bsdata.find("div", {"class": "metadata-and-wiki-row"})
    if metadata is not None:
        for dl in metadata.findAll("dl"):
            attrKeys = [dt.text for dt in dl.findAll("dt")]
            attrVals = []
            for dd in dl.findAll("dd"):
                refs = dd.findAll("a")
                # A <dd> without links is kept as text, otherwise as links.
                attrVals.append(
                    [artistDBTextClass(dd)] if len(refs) == 0
                    else [artistDBLinkClass(ref) for ref in refs])
            # NOTE(review): with several <dl> elements only the last one is
            # kept -- confirm whether they should be merged instead.
            generalData["Metadata"] = dict(zip(attrKeys, attrVals))

    for wikicolumn in self.bsdata.findAll("div", {"class": "wiki-column"}):
        for wikiblock in wikicolumn.findAll("div", {"class": "wiki-block"}):
            refs = wikiblock.findAll("a")
            links = _links(refs) or []
            # Remove the anchors so the text record is built from the block
            # without them (removeTag is a project helper; presumably it
            # mutates the block in place -- TODO confirm).
            for ref in refs:
                removeTag(wikiblock, ref)
            text = artistDBTextClass(wikiblock)

            wiki = generalData.setdefault("Wiki", {"Text": [], "Refs": []})
            wiki["Text"].append(text)
            # Add each block's links once; they used to be appended once per
            # link, which only the de-duplication below corrected.
            wiki["Refs"].extend(links)

    wiki = generalData.get("Wiki")
    if wiki is not None:
        # De-duplicate links on (href, text), keeping first-seen order.
        keep = {(ref.href, ref.text): ref for ref in wiki["Refs"]}
        wiki["Refs"] = list(keep.values())

    ##
    ## Similar artists (full-width list first, sidebar as the fallback)
    ##
    similarData = self.bsdata.find(
        "ol", {"class": "catalogue-overview-similar-artists-full-width"})
    if similarData is None:
        similarData = self.bsdata.find(
            "section", {"class": "artist-similar-sidebar"})
    if similarData is not None:
        refs = [li.find("a", {"class": "link-block-target"})
                for li in similarData.findAll("li")]
    else:
        refs = None
    extraData = _links(refs)

    ##
    ## Tags
    ##
    tags = self.bsdata.find("section", {"class": "catalogue-tags"})
    tagsData = _links(tags.findAll("a") if tags is not None else None)

    ##
    ## External
    ##
    external = self.bsdata.find("section",
                                {"class": "external-links-section"})
    externalData = _links(
        external.findAll("a") if external is not None else None)

    generalData = generalData if len(generalData) > 0 else None

    return artistDBProfileClass(general=generalData,
                                tags=tagsData,
                                extra=extraData,
                                external=externalData)