def graph_features(self):
    """Lazily build (or load from cache) the Bechdel graph-features SFrame.

    Tries to load a previously saved features CSV; on failure, rebuilds the
    features from the movie-graph data, joins the female-top-10-roles data
    and the triangles data, computes per-type triangle percentages, and
    caches the result to disk.

    Returns:
        SFrame with one row per movie and the graph-derived feature columns.
    """
    # BUG FIX: the original read/wrote `self.graph_features`, which collides
    # with this accessor's own name (infinite recursion as a property, or a
    # permanently-truthy bound method otherwise). Use the `_graph_features`
    # backing field that __init__ initializes to an empty SFrame.
    if len(self._graph_features) == 0:
        try:
            self._graph_features = SFrame.read_csv(
                f"{DATA_PATH}/bechdel_features.csv")
        except Exception:  # cache missing/unreadable -> rebuild from scratch
            t = triangles()
            features = SFrame.read_csv("../temp/graph_features.csv")
            features = features.join(
                SFrame(get_female_in_top_10_roles()),
                on={
                    "movie_name": "movie_name",
                    "year": "year"
                })
            features = features.join(
                SFrame(t), on={
                    "movie_name": "movie",
                    "year": "year"
                })
            # Columns "0".."3" hold the counts of each triangle gender type.
            features["total_tri"] = (features["0"] + features["1"] +
                                     features["2"] + features["3"])
            for i in range(4):
                features[f"{i}%"] = features[str(i)] / features["total_tri"]
            features.save(f"{DATA_PATH}/bechdel_features.csv", "csv")
            self._graph_features = features
    return self._graph_features
def rating(self):
    """Lazily download and load the IMDb ratings table, joined with titles."""
    if self._rating is not None:
        return self._rating
    download_file(IMDB_RATING_URL, f"{OUTPUT_PATH}/title.ratings.tsv.gz",
                  False)
    ratings_sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.ratings.tsv.gz",
                                 delimiter="\t",
                                 na_values=["\\N"],
                                 verbose=self._verbose)
    self._rating = ratings_sf.join(self.title)
    return self._rating
def get_relationship_triangles():
    """Load actor triangles and annotate each with the actors' genders.

    Reads the precomputed triangles CSV (three actor ids, movie, year),
    resolves each actor's gender, joins movie metadata, adds per-member
    male-indicator columns plus a total-men count, and splits genres into
    lists.
    """
    tri_sf = SFrame.read_csv(f"{OUTPUT_PATH}/triangles.csv",
                             usecols=["0", "1", "2", "3", "4"])
    gender_sf = tri_sf.apply(lambda row: [
        imdb_data.get_actor_gender(row["0"]),
        imdb_data.get_actor_gender(row["1"]),
        imdb_data.get_actor_gender(row["2"])
    ])
    gender_sf = gender_sf.unpack()
    gender_sf["movie"] = tri_sf["3"]
    gender_sf["year"] = tri_sf["4"]
    gender_sf = gender_sf.dropna()
    gender_sf = gender_sf.join(imdb_data.title, {
        "movie": "primaryTitle",
        "year": "startYear"
    })
    # Columns "1".."3" flag whether each triangle member is male.
    for member in range(3):
        gender_sf[str(member + 1)] = gender_sf[f"X.{member}"] == "M"
    gender_sf["total_men"] = (gender_sf["1"] + gender_sf["2"] +
                              gender_sf["3"])
    gender_sf["genres"] = gender_sf["genres"].apply(lambda g: g.split(","))
    return gender_sf
def crew(self):
    """Lazily download and load the IMDb crew table, exploded by director.

    Downloads title.crew.tsv.gz on first access, splits the comma-separated
    `directors` column, and stacks it so each row holds a single director.
    """
    if self._crew is None:
        download_file(IMDB_CREW_URL, f"{OUTPUT_PATH}/title.crew.tsv.gz",
                      False)
        self._crew = SFrame.read_csv(f"{OUTPUT_PATH}/title.crew.tsv.gz",
                                     delimiter="\t",
                                     na_values=["\\N"],
                                     verbose=self._verbose)
        # CONSISTENCY FIX: read from the `_crew` backing field rather than
        # re-entering this accessor (`self.crew[...]`), which relied on
        # accidental re-entrancy; every sibling loader uses its backing field.
        self._crew["directors"] = self._crew["directors"].apply(
            lambda c: c.split(","))
        self._crew = self._crew.stack("directors", "directors")
    return self._crew
def popular_actors(self):
    """Lazily compute per-actor average ratings over well-voted movies.

    Joins principals (actors/actresses only) against ratings of movies with
    more than 1000 votes, aggregates average rating and appearance count per
    actor, attaches names, and adds a gender column.
    """
    if self._actors is not None:
        return self._actors
    download_file(IMDB_PRINCIPALS_URL,
                  f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
    actors_sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                                delimiter="\t",
                                na_values=["\\N"],
                                verbose=self._verbose)
    actors_sf = actors_sf.filter_by(["actor", "actress"],
                                    "category")["tconst", "nconst"]
    well_voted_movies = self.rating[(self.rating["titleType"] == "movie")
                                    & (self.rating["numVotes"] > 1000)]
    actors_sf = actors_sf.join(well_voted_movies)
    actors_sf = actors_sf.groupby("nconst",
                                  operations={
                                      'averageRating':
                                      agg.AVG("averageRating"),
                                      'count': agg.COUNT()
                                  })
    actors_sf = actors_sf.sort("averageRating", ascending=False)
    names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                            delimiter="\t")
    actors_sf = actors_sf.join(names)
    actors_sf["gender"] = actors_sf.apply(
        lambda p: self.add_actor_gender(p))
    self._actors = actors_sf
    return self._actors
def actors_movies(self):
    """Lazily build the actor-to-movie SFrame (movies only, with characters)."""
    if self._actors_movies is not None:
        return self._actors_movies
    download_file(IMDB_PRINCIPALS_URL,
                  f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
    movies_sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                                delimiter="\t",
                                na_values=["\\N"],
                                verbose=self._verbose)
    movies_sf = movies_sf.filter_by(
        ["actor", "actress"], "category")["tconst", "nconst", "characters"]
    movies_sf = movies_sf.join(self.title[self.title["titleType"] == "movie"])
    movies_sf = movies_sf.join(self.all_actors)
    self._actors_movies = movies_sf
    return self._actors_movies
def paper_fields_of_study(self):
    """Create the paper-to-field-of-study SFrame from .txt.gz files.

    NOTE(review): unlike the sibling loaders, this reads a hard-coded
    "~/mag/" path instead of self._dataset_dir — confirm that is intended.
    """
    cols = ["PaperId", "FieldOfStudyId", "Score"]
    fos_sf = SFrame.read_csv("~/mag/PaperFieldsOfStudy.txt.gz",
                             header=False,
                             sep="\t")
    # Positional columns X1..Xn are renamed to their semantic names.
    return fos_sf.rename({f"X{i + 1}": name for i, name in enumerate(cols)})
def all_actors(self):
    """Lazily load all IMDb names whose professions include actor/actress."""
    if self._all_actors is not None:
        return self._all_actors
    download_file(IMDB_NAMES_URL, f"{OUTPUT_PATH}/name.basics.tsv.gz", False)
    people_sf = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                                delimiter="\t",
                                na_values=["\\N"],
                                verbose=self._verbose)
    people_sf["primaryProfession"] = people_sf["primaryProfession"].apply(
        lambda x: x.split(","))
    people_sf = people_sf.stack("primaryProfession", "primaryProfession")
    people_sf = people_sf.filter_by(["actor", "actress"],
                                    "primaryProfession")
    people_sf["gender"] = people_sf.apply(
        lambda p: self.add_actor_gender(p))
    self._all_actors = people_sf
    return self._all_actors
def references(self):
    """Create the paper-references SFrame from .txt.gz files."""
    refs_sf = SFrame.read_csv(
        str(self._dataset_dir / "PaperReferences.txt.gz"),
        header=False,
        delimiter="\t")
    return refs_sf.rename({"X1": "PaperId", "X2": "PaperReferenceId"})
def papers_fields_of_study(self):
    """Create the papers-to-fields-of-study SFrame from .txt.gz files.

    Returns:
        SFrame with PaperId, FieldOfStudyId and Score columns.
    """
    fos = SFrame.read_csv(
        str(self._dataset_dir / "PapersFieldsOfStudy.txt.gz"),
        header=False,
        delimiter="\t")
    # BUG FIX: the original returned `references.rename(...)`, a name that
    # does not exist in this scope (copy-paste from references()); the
    # freshly loaded `fos` frame is the one to rename.
    return fos.rename({
        "X1": "PaperId",
        "X2": "FieldOfStudyId",
        "X3": "Score"
    })
def __init__(self):
    """Load Bechdel-test data, join it to IMDb titles, and set up the model."""
    self.bechdel = SFrame.read_csv(f"{DATA_PATH}/bechdel.csv",
                                   column_type_hints={"imdbid": str})
    # BUG FIX: SFrame.sort returns a new frame; the original call discarded
    # the sorted result. Keep the frame sorted by year, descending
    # (get_bechdel_movies() assigns the result of the same call).
    self.bechdel = self.bechdel.sort("year", False)
    # IMDb title ids are the imdbid prefixed with "tt".
    self.bechdel["tconst"] = "tt" + self.bechdel["imdbid"]
    self.bechdel_imdb = imdb_data.title.join(self.bechdel)
    self.clf = RandomForestClassifier(n_jobs=-1,
                                      n_estimators=100,
                                      max_depth=5,
                                      random_state=1)
    # Backing field for the lazy graph_features accessor.
    self._graph_features = SFrame()
def field_of_study_children(self):
    """Create the field-of-study hierarchy SFrame from .txt.gz files."""
    hierarchy_sf = SFrame.read_csv(
        str(self._dataset_dir / "FieldOfStudyChildren.txt.gz"),
        header=False,
        delimiter="\t")
    return hierarchy_sf.rename({
        "X1": "FieldOfStudyId",
        "X2": "ChildFieldOfStudyId"
    })
def generate_blacklist_roles():
    """Build a blacklist of character-role names that look like generic words.

    Loads first-name and surname lists plus the IMDb principals table,
    restricts to movie acting roles, then filters out characters that match
    real person names or dictionary words; the surviving role names are
    exported to blacklist_roles.csv.
    """
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv",
                                 verbose=False)["Name"]
    surenames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv",
                                verbose=False)["name"]
    # Title-case the surnames so they compare against title-cased characters.
    surenames = surenames.apply(lambda n: n.title())
    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                         delimiter="\t",
                         column_type_hints={"characters": list},
                         na_values=["\\N"])
    # Keep only acting credits, with billing order and character lists.
    sf = sf.filter_by(["actor", "actress"],
                      "category")["tconst", "ordering", "characters",
                                  "nconst"]
    # Movies only (imdb_data.title carries the titleType column).
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    # One row per (credit, character) pair.
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")
    # Characters played more than once by the same actor are whitelisted
    # (they are real recurring roles, not generic labels).
    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    sf = sf.filter_by(whitelist, "character", True)
    # Aggregate per character: average billing position and occurrences.
    sf = sf.groupby(key_column_names=['character'],
                    operations={
                        'ordering': agg.AVG("ordering"),
                        'count': agg.COUNT()
                    })
    # Last whitespace-separated token of the character name.
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    # Drop characters whose last token is an NLTK name, surname or firstname.
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surenames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    sf = sf.sort("count", False)
    # Keep only low-billed roles (average ordering > 3).
    sf = sf[sf['ordering'] > 3]
    # Title-cased WordNet vocabulary minus person names.
    w = {x.replace("_", " ").title()
         for x in wordnet.words()} - set(names.words())
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    # Words of the character name that are plain dictionary words.
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    # Very frequent roles, plus moderately frequent ones containing
    # dictionary words, are considered generic and blacklisted.
    sf = sf[sf['count'] > 11].append(sf[(sf['count'] > 1)
                                        & (sf['count'] < 10)
                                        & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")
def createFrame(file):
    """Load one space-delimited label file into an SFrame.

    Renames the positional columns to bounding-box fields and tags every
    row with the image name (the file name without its extension).
    """
    bbox_sf = SFrame.read_csv(args.labels + '/' + file,
                              delimiter=' ',
                              header=False)
    column_names = ['name', 'xMin', 'yMin', 'xMax', 'yMax']
    bbox_sf = bbox_sf.rename(
        {f'X{i}': col
         for i, col in enumerate(column_names, start=1)})
    bbox_sf['image'] = os.path.splitext(file)[0]
    return bbox_sf
def get_directors_data(self):
    """Return directors of >5 well-voted movies, sorted by average rating."""
    well_voted = self.rating[self.rating["numVotes"] > 10000]
    directors_sf = self.crew.join(well_voted)
    movies_only = self.title[self.title["titleType"] == "movie"]
    directors_sf = directors_sf.join(movies_only)
    directors_sf = directors_sf.groupby(key_column_names='directors',
                                        operations={
                                            'averageRating':
                                            agg.AVG("averageRating"),
                                            'count': agg.COUNT()
                                        })
    directors_sf = directors_sf[directors_sf["count"] > 5]
    names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                            delimiter="\t")
    directors_sf = directors_sf.join(names, {"directors": "nconst"})
    return directors_sf.sort("averageRating", ascending=False)
def sjr_to_csv(self, regex):
    """Collect SJR yearly CSV exports matching *regex* into one SFrame.

    Extracts the year from each file name, normalizes the per-year
    "Total Docs." column name, fills in missing optional columns, and
    stacks the ISSN lists so each output row carries a single ISSN.
    """
    combined = SFrame()
    year_re = re.compile(r'.*([1-3][0-9]{3})')
    for path in self._dataset_dir.glob(regex):
        if path.suffix != ".csv":
            continue
        year = int(year_re.match(path.name).group(1))
        year_sf = SFrame.read_csv(str(path), delimiter=';')
        year_sf['Year'] = year
        # The source files name this column "Total Docs. (<year>)".
        year_sf = year_sf.rename({"Total Docs. (%s)" % year: "Total Docs."})
        for col in ["Categories"]:
            if col not in year_sf.column_names():
                year_sf[col] = ''
        combined = combined.append(year_sf)
    issn_re = re.compile('(\\d{8})')
    combined['Issn'] = combined['Issn'].apply(lambda i: issn_re.findall(i))
    return combined.stack('Issn', new_column_name='ISSN')
def papers(self):
    """Create the Papers SFrame from .txt.gz files describing each paper."""
    cols = [
        "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
        "BookTitle", "Year", "Date", "Publisher", "JournalId",
        "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
        "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
        "EstimatedCitation", "OriginalVenue", "CreatedDate"
    ]
    papers_sf = SFrame.read_csv(str(self._dataset_dir / "Papers.txt.gz"),
                                header=False,
                                sep="\t")
    # Positional columns X1..Xn are renamed to their semantic names.
    papers_sf = papers_sf.rename(
        {f"X{i + 1}": name for i, name in enumerate(cols)})
    papers_sf["Year"] = papers_sf["Year"].astype(int)
    return papers_sf
def get_bechdel_movies():
    """Load the joined Bechdel/IMDb dataset and build graphs for its movies."""
    movies_sf = SFrame.read_csv(f"{DATA_PATH}/bechdel_imdb.csv")
    movies_sf = movies_sf.sort("year", False)
    movies_sf = movies_sf.filter_by("movie", "titleType")
    generate_movies_graphs(movies_sf)
# Zillow enrichment script: looks up each Trulia listing on Zillow and
# flattens the returned estimate data into columns.
# NOTE(review): the placeholder below is not valid Python until a real
# Zillow Web Services ID string is supplied.
zwsid = <REDACTED ADD YOUR OWN KEY>
key = zwsid
api = zillow.ValuationApi()


def getSearchResults(key, row):
    """Fetch Zillow deep-search data for one listing row.

    Returns the response as a dict, or None if the lookup fails for any
    reason (bare except deliberately makes this best-effort).
    """
    try:
        address = row['ADDRESS'].strip()
        zipCode = row['ZIP CODE']
        data = api.GetDeepSearchResults(key, address, zipCode)
        return data.get_dict()
    except:  # NOTE(review): swallows all API/parse errors by design
        pass


sf = SFrame.read_csv('trulia11566.csv', verbose=False)
# One Zillow lookup per listing row.
sf['zillowData'] = sf.apply(lambda row: getSearchResults(key, row))
# Flatten the nested response dicts into columns.
sf = sf.unpack('zillowData').unpack('zillowData.zestimate')
# NOTE(review): this column selection is truncated here — the statement
# continues past the end of this chunk.
sf['ADDRESS', 'LOCALITY', 'STATE', 'ZIP CODE', 'COUNTY', 'STREET', 'TYPE',
   'PRICE', 'zillowData.zestimate.amount',
   'zillowData.zestimate.valuation_range_high',
def row_to_bbox_coordinates(row):
    """Return bounding-box center/size coordinates for one row.

    Converts corner coordinates (xMin/yMin/xMax/yMax) to a dict of
    (center_x, center_y, width, height),
    e.g. {'x': 100, 'y': 120, 'width': 80, 'height': 120}.
    """
    box_width = row['xMax'] - row['xMin']
    box_height = row['yMax'] - row['yMin']
    return {
        'x': row['xMin'] + box_width / 2,
        'y': row['yMin'] + box_height / 2,
        'width': box_width,
        'height': box_height
    }


sf = SFrame.read_csv(args.input)
# Rename columns to the schema Create ML expects.
sf = sf.rename({'name': 'label', 'image': 'imagefilename'})
# Convert the corner-based coordinates to center+size form.
sf['coordinates'] = sf.apply(row_to_bbox_coordinates)
# Drop the now-redundant corner columns and the id.
del sf['xMin'], sf['xMax'], sf['yMin'], sf['yMax'], sf['id']
# Nest label and coordinates into a single annotation dict column.
sf = sf.pack_columns(['label', 'coordinates'],
                     new_column_name='bbox',
                     dtype=dict)
def title(self):
    """Lazily download and load the IMDb title.basics table."""
    if self._title is not None:
        return self._title
    download_file(IMDB_TITLES_URL, f"{OUTPUT_PATH}/title.basics.tsv.gz",
                  False)
    self._title = SFrame.read_csv(f"{OUTPUT_PATH}/title.basics.tsv.gz",
                                  delimiter="\t",
                                  na_values=["\\N"],
                                  verbose=self._verbose)
    return self._title