def graph_features(self):
    """Lazily build the per-movie graph feature SFrame.

    Tries to load a cached CSV first; if that fails, rebuilds the
    features from the triangle counts and the female-in-top-10-roles
    data, derives triangle totals/percentages, and caches the result.

    :return: SFrame of graph features per movie.
    """
    # NOTE(review): __init__ initializes self._graph_features, while this
    # method reads/writes self.graph_features — confirm which attribute
    # is the intended cache (a property elsewhere may reconcile them).
    if not self.graph_features:
        try:
            self.graph_features = SFrame.read_csv(
                f"{DATA_PATH}/bechdel_features.csv")
        # BUG FIX: was a bare `except:` (also caught KeyboardInterrupt /
        # SystemExit); any ordinary failure still means "rebuild".
        except Exception:
            t = triangles()
            self.graph_features = SFrame.read_csv(
                "../temp/graph_features.csv")
            self.graph_features = self.graph_features.join(
                SFrame(get_female_in_top_10_roles()),
                on={
                    "movie_name": "movie_name",
                    "year": "year"
                })
            self.graph_features = self.graph_features.join(
                SFrame(t), on={
                    "movie_name": "movie",
                    "year": "year"
                })
            # Columns "0"-"3" hold triangle counts per gender mix.
            self.graph_features["total_tri"] = self.graph_features["0"] + \
                self.graph_features["1"] + \
                self.graph_features["2"] + self.graph_features["3"]
            for i in range(4):
                self.graph_features[f"{i}%"] = self.graph_features[
                    str(i)] / self.graph_features["total_tri"]
            # Cache the rebuilt features for the next run.
            self.graph_features.save(f"{DATA_PATH}/bechdel_features.csv",
                                     "csv")
    return self.graph_features
def urls(self):
    """Create the URLs SFrame from the .txt.gz file.

    Collects every SourceUrl of a paper into one list per PaperId.
    """
    columns = ["PaperId", "SourceType", "SourceUrl", "LanguageCode"]
    raw = pd.read_csv(self._dataset_dir / "PaperUrls.txt.gz",
                      sep="\t",
                      names=columns).replace({pd.NA: None})
    urls_sf = SFrame(raw)
    return urls_sf.groupby("PaperId", {"Urls": agg.CONCAT("SourceUrl")})
def _get_sframes(features_train, features_test, labels_train, labels_test):
    """Join features with labels and wrap both splits as SFrames.

    :return: tuple of (train SFrame, test SFrame).
    """
    logging.debug(f"turi._get_sframes()")
    joined_train: pandas.DataFrame = features_train.join(labels_train)
    joined_test: pandas.DataFrame = features_test.join(labels_test)
    return SFrame(data=joined_train), SFrame(data=joined_test)
def __init__(self):
    """Load the Bechdel dataset, join it with IMDb titles and set up
    the random-forest classifier."""
    self.bechdel = SFrame.read_csv(f"{DATA_PATH}/bechdel.csv",
                                   column_type_hints={"imdbid": str})
    # BUG FIX: SFrame.sort returns a new SFrame; the original discarded
    # the result, so the frame was never actually sorted by year.
    self.bechdel = self.bechdel.sort("year", False)
    # IMDb title ids are "tt" + zero-padded imdbid.
    self.bechdel["tconst"] = "tt" + self.bechdel["imdbid"]
    self.bechdel_imdb = imdb_data.title.join(self.bechdel)
    self.clf = RandomForestClassifier(n_jobs=-1,
                                      n_estimators=100,
                                      max_depth=5,
                                      random_state=1)
    self._graph_features = SFrame()
def fields_of_study_papers_ids(self, levels=(1, 2, 3)):
    """Create an SFrame with the PaperIds of each field of study.

    :param levels: iterable of field-of-study levels to include.
    :return: SFrame accumulated over all requested levels.
    """
    accumulated = SFrame()
    for lvl in tqdm(levels):
        accumulated = accumulated.append(
            self._create_field_of_study_paper_ids(lvl))
    return accumulated
def rating(self):
    """Lazily download and load the IMDb ratings table, joined with titles.

    :return: cached ratings SFrame.
    """
    if self._rating is not None:
        return self._rating
    download_file(IMDB_RATING_URL, f"{OUTPUT_PATH}/title.ratings.tsv.gz",
                  False)
    ratings = SFrame.read_csv(f"{OUTPUT_PATH}/title.ratings.tsv.gz",
                              delimiter="\t",
                              na_values=["\\N"],
                              verbose=self._verbose)
    self._rating = ratings.join(self.title)
    return self._rating
def data(self):
    """Create the AMiner Papers SFrame from the AMiner text files.

    Reads every AMiner/*.txt file as line-oriented JSON.
    """
    aminer_glob = self._dataset_dir.joinpath("AMiner/*.txt")
    return SFrame.read_json(aminer_glob, orient='lines')
def get_relationship_triangles():
    """Load actor triangles and annotate each with the actors' genders.

    Reads the precomputed triangles CSV (columns "0"-"2" are actor ids,
    "3" the movie name, "4" the year), maps each actor to a gender,
    joins with the IMDb title table, and counts how many of the three
    actors are men.

    :return: SFrame with per-actor gender flags, total_men, and genres
        split into lists.
    """
    triangles = SFrame.read_csv(f"{OUTPUT_PATH}/triangles.csv",
                                usecols=["0", "1", "2", "3", "4"])
    # apply() yields a 3-element list per row; unpack() spreads the list
    # into columns named X.0 / X.1 / X.2.
    triangles_gender = triangles.apply(lambda x: [
        imdb_data.get_actor_gender(x["0"]),
        imdb_data.get_actor_gender(x["1"]),
        imdb_data.get_actor_gender(x["2"])
    ])
    triangles_gender = triangles_gender.unpack()
    triangles_gender["movie"] = triangles["3"]
    triangles_gender["year"] = triangles["4"]
    # Drop triangles where any actor's gender is unknown.
    triangles_gender = triangles_gender.dropna()
    triangles_gender = triangles_gender.join(imdb_data.title, {
        "movie": "primaryTitle",
        "year": "startYear"
    })
    # Reuse columns "1"-"3" as boolean is-male flags (True == "M").
    triangles_gender["1"] = triangles_gender["X.0"] == "M"
    triangles_gender["2"] = triangles_gender["X.1"] == "M"
    triangles_gender["3"] = triangles_gender["X.2"] == "M"
    triangles_gender["total_men"] = triangles_gender["1"] + triangles_gender[
        "2"] + triangles_gender["3"]
    # Genres arrive as a comma-separated string; split into a list.
    triangles_gender["genres"] = triangles_gender["genres"].apply(
        lambda x: x.split(","))
    return triangles_gender
def sjr_to_csv(self, regex):
    """Collect the SJR yearly CSV files matching *regex* into one SFrame.

    The year is parsed from each filename, the per-year
    "Total Docs. (YYYY)" column is normalized, missing extra columns are
    filled with empty strings, and the ISSN lists are stacked into one
    row per ISSN.
    """
    combined = SFrame()
    for path in self._dataset_dir.glob(regex):
        if path.suffix != ".csv":
            continue
        year = int(re.match(r'.*([1-3][0-9]{3})', path.name).group(1))
        year_sf = SFrame.read_csv(str(path), delimiter=';')
        year_sf['Year'] = year
        year_sf = year_sf.rename({"Total Docs. (%s)" % year: "Total Docs."})
        for col in ["Categories"]:
            if col not in year_sf.column_names():
                year_sf[col] = ''
        combined = combined.append(year_sf)
    issn_re = re.compile('(\\d{8})')
    combined['Issn'] = combined['Issn'].apply(lambda i: issn_re.findall(i))
    return combined.stack('Issn', new_column_name='ISSN')
def __init__(self):
    """Load the trained similarity model and its indexed image SFrame,
    and initialize empty query/result state."""
    # Pre-built SFrame of indexed images and the matching trained model.
    self.imgframe = tc.load_sframe('model/final/final.sframe')
    self.model = tc.load_model('model/final/final_model')
    # Placeholder for the query image.
    self.sample = tc.Image()
    # Query results and working buffers, filled in later.
    self.results = SFrame()
    self.rows = SArray()
    self.pathlist = []
    self.distance_list = []
def crew(self):
    """Lazily download and load the IMDb crew table.

    Splits the comma-separated "directors" column and stacks it so each
    row holds a single director.

    :return: cached crew SFrame.
    """
    if self._crew is None:
        download_file(IMDB_CREW_URL, f"{OUTPUT_PATH}/title.crew.tsv.gz",
                      False)
        self._crew = SFrame.read_csv(f"{OUTPUT_PATH}/title.crew.tsv.gz",
                                     delimiter="\t",
                                     na_values=["\\N"],
                                     verbose=self._verbose)
        # BUG FIX: read from self._crew, not self.crew — the original
        # re-entered this property mid-initialization and only worked
        # because the cache attribute had just been assigned.
        self._crew["directors"] = self._crew["directors"].apply(
            lambda c: c.split(","))
        self._crew = self._crew.stack("directors", "directors")
    return self._crew
def popular_actors(self):
    """Lazily build actors ranked by the average rating of their movies.

    Filters principals to acting credits, keeps only movies with more
    than 1000 votes, averages ratings per actor, and attaches names and
    gender.

    :return: SFrame sorted by averageRating (descending), cached.
    """
    if self._actors is None:
        download_file(IMDB_PRINCIPALS_URL,
                      f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
        self._actors = SFrame.read_csv(
            f"{OUTPUT_PATH}/title.principals.tsv.gz",
            delimiter="\t",
            na_values=["\\N"],
            verbose=self._verbose)
        self._actors = self._actors.filter_by(
            ["actor", "actress"], "category")["tconst", "nconst"]
        # Keep only well-voted movies before averaging per actor.
        self._actors = self._actors.join(
            self.rating[(self.rating["titleType"] == "movie")
                        & (self.rating["numVotes"] > 1000)])
        self._actors = self._actors.groupby(
            "nconst",
            operations={
                'averageRating': agg.AVG("averageRating"),
                'count': agg.COUNT()
            })
        self._actors = self._actors.sort("averageRating", ascending=False)
        # NOTE(review): assumes name.basics.tsv.gz already exists in
        # OUTPUT_PATH (downloaded by all_actors) — confirm call order.
        names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                                delimiter="\t")
        self._actors = self._actors.join(names)
        self._actors["gender"] = self._actors.apply(
            lambda p: self.add_actor_gender(p))
    return self._actors
def all_actors(self):
    """Lazily load every IMDb person whose professions include acting.

    Splits and stacks primaryProfession, keeps actor/actress rows, and
    derives a "gender" column.

    :return: cached actors SFrame.
    """
    if self._all_actors is not None:
        return self._all_actors
    download_file(IMDB_NAMES_URL, f"{OUTPUT_PATH}/name.basics.tsv.gz",
                  False)
    people = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                             delimiter="\t",
                             na_values=["\\N"],
                             verbose=self._verbose)
    people["primaryProfession"] = people["primaryProfession"].apply(
        lambda x: x.split(","))
    people = people.stack("primaryProfession", "primaryProfession")
    people = people.filter_by(["actor", "actress"], "primaryProfession")
    people["gender"] = people.apply(lambda p: self.add_actor_gender(p))
    self._all_actors = people
    return self._all_actors
def paper_fields_of_study(self):
    """Create the paper fields-of-study SFrame from the .txt.gz file.

    :return: SFrame with PaperId, FieldOfStudyId and Score columns.
    """
    cols = ["PaperId", "FieldOfStudyId", "Score"]
    # CONSISTENCY FIX: read from the dataset directory like every other
    # loader in this class instead of a hard-coded "~/mag" path, and use
    # the `delimiter` keyword that the sibling SFrame.read_csv calls use
    # (SFrame.read_csv has no `sep` parameter).
    papers_field = SFrame.read_csv(str(self._dataset_dir /
                                       "PaperFieldsOfStudy.txt.gz"),
                                   header=False,
                                   delimiter="\t")
    return papers_field.rename(
        dict(zip([f"X{i+1}" for i in range(len(cols))], cols)))
def actors_movies(self):
    """Lazily build the table of acting credits in movies.

    Joins principal acting credits with movie titles and the full
    actors table.

    :return: cached SFrame of actor/movie/character rows.
    """
    if self._actors_movies is not None:
        return self._actors_movies
    download_file(IMDB_PRINCIPALS_URL,
                  f"{OUTPUT_PATH}/title.principals.tsv.gz", False)
    credits = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                              delimiter="\t",
                              na_values=["\\N"],
                              verbose=self._verbose)
    credits = credits.filter_by(
        ["actor", "actress"], "category")["tconst", "nconst", "characters"]
    credits = credits.join(self.title[self.title["titleType"] == "movie"])
    credits = credits.join(self.all_actors)
    self._actors_movies = credits
    return self._actors_movies
def references(self):
    """Create the references SFrame from the .txt.gz file."""
    refs = SFrame.read_csv(str(self._dataset_dir /
                               "PaperReferences.txt.gz"),
                           header=False,
                           delimiter="\t")
    return refs.rename({"X1": "PaperId", "X2": "PaperReferenceId"})
def papers_fields_of_study(self):
    """Create the papers fields-of-study SFrame from the .txt.gz file.

    :return: SFrame with PaperId, FieldOfStudyId and Score columns.
    """
    fos = SFrame.read_csv(str(self._dataset_dir /
                              "PapersFieldsOfStudy.txt.gz"),
                          header=False,
                          delimiter="\t")
    # BUG FIX: the original returned `references.rename(...)`, a
    # guaranteed NameError — the frame just read is bound to `fos`.
    return fos.rename({
        "X1": "PaperId",
        "X2": "FieldOfStudyId",
        "X3": "Score"
    })
def field_of_study_children(self):
    """Create the field-of-study hierarchy SFrame from the .txt.gz file."""
    hierarchy = SFrame.read_csv(str(self._dataset_dir /
                                    "FieldOfStudyChildren.txt.gz"),
                                header=False,
                                delimiter="\t")
    return hierarchy.rename({
        "X1": "FieldOfStudyId",
        "X2": "ChildFieldOfStudyId"
    })
def fields_of_study(self):
    """Create the Fields of Study SFrame from the .txt.gz file."""
    cols = [
        "FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
        "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"
    ]
    frame = pd.read_csv(self._dataset_dir / "FieldsOfStudy.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None})
    return SFrame(frame)
def paper_resources(self):
    """Create the paper resources SFrame from the .txt.gz file.

    ResourceType: 1 = Project, 2 = Data, 4 = Code.
    """
    cols = [
        "PaperId", "ResourceType", "ResourceUrl", "SourceUrl",
        "RelationshipType"
    ]
    resources = pd.read_csv(self._dataset_dir / "PaperResources.txt.gz",
                            sep="\t",
                            names=cols).replace({pd.NA: None})
    return SFrame(resources)
def generate_blacklist_roles():
    """Build a blacklist of character names that are unlikely real names.

    Heuristic pipeline over IMDb principals: keep movie acting credits,
    drop character names an actor has played more than once, drop names
    matching known first/sure-name lists or dictionary person names,
    keep low-billed roles, and finally keep very frequent names plus
    moderately frequent ones containing a dictionary word. The result
    is written to blacklist_roles.csv.
    """
    firstnames = SFrame.read_csv(f"{DATA_PATH}/firstnames.csv",
                                 verbose=False)["Name"]
    surenames = SFrame.read_csv(f"{DATA_PATH}/surenames.csv",
                                verbose=False)["name"]
    # Title-case to match the casing applied to characters below.
    surenames = surenames.apply(lambda n: n.title())
    sf = SFrame.read_csv(f"{OUTPUT_PATH}/title.principals.tsv.gz",
                         delimiter="\t",
                         column_type_hints={"characters": list},
                         na_values=["\\N"])
    sf = sf.filter_by(["actor", "actress"],
                      "category")["tconst", "ordering", "characters",
                                  "nconst"]
    sf = sf.join(imdb_data.title[imdb_data.title["titleType"] == "movie"])
    # One row per character played in a credit.
    sf = sf.stack("characters", "character")
    sf["character"] = sf["character"].apply(lambda c: c.title())
    sf.export_csv(f"{TEMP_PATH}/roles3.csv")
    # Characters the same actor played more than once are likely real
    # recurring roles — exclude them from blacklist candidates.
    whitelist = sf.groupby(key_column_names=['character', "nconst"],
                           operations={'count': agg.COUNT()})
    whitelist = whitelist[whitelist["count"] > 1]['character']
    sf = sf.filter_by(whitelist, "character", True)
    sf = sf.groupby(key_column_names=['character'],
                    operations={
                        'ordering': agg.AVG("ordering"),
                        'count': agg.COUNT()
                    })
    # Last word of the character name, matched against the name lists.
    sf["name"] = sf["character"].apply(lambda c: c.split(" ")[-1].strip())
    sf = sf.filter_by(names.words(), "name", exclude=True)
    sf = sf.filter_by(surenames, "name", exclude=True)
    sf = sf.filter_by(firstnames, "name", exclude=True)
    sf = sf.sort("count", False)
    # Keep only low-billed roles (average billing position > 3).
    sf = sf[sf['ordering'] > 3]
    # Dictionary words that are not person names.
    w = {x.replace("_", " ").title()
         for x in wordnet.words()} - set(names.words())
    sf["set"] = sf["character"].apply(lambda x: x.split(" "))
    sf["set"] = sf["set"].apply(lambda x: w & set(x))
    # Final blacklist: very frequent names, plus moderately frequent
    # ones containing a dictionary word.
    sf = sf[sf['count'] > 11].append(sf[(sf['count'] > 1)
                                        & (sf['count'] < 10)
                                        & (sf["set"] != [])])
    sf[["character"]].export_csv(f"{OUTPUT_PATH}/blacklist_roles.csv")
def createFrame(file):
    """Read one space-separated label file into an SFrame.

    Renames the positional columns to bounding-box fields and tags each
    row with the image name (file name without extension).
    """
    labels = SFrame.read_csv(args.labels + '/' + file,
                             delimiter=' ',
                             header=False)
    renamed = labels.rename({
        'X1': 'name',
        'X2': 'xMin',
        'X3': 'yMin',
        'X4': 'xMax',
        'X5': 'yMax'
    })
    renamed['image'] = os.path.splitext(file)[0]
    return renamed
def journals(self):
    """Create the Journals SFrame from the .txt.gz file."""
    cols = [
        "JournalId", "Rank", "NormalizedName", "DisplayName", "Issn",
        "Publisher", "Webpage", "PaperCount", "CitationCount",
        "CreatedDate"
    ]
    frame = pd.read_csv(self._dataset_dir / "Journals.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None})
    return SFrame(frame)
def paper_author_affiliations(self):
    """Create the paper/author affiliation SFrame from the .txt.gz file.

    :return: SFrame linking papers to authors and affiliations.
    """
    cols = [
        "PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber",
        "OriginalAuthor", "OriginalAffiliation"
    ]
    frame = pd.read_csv(self._dataset_dir / "PaperAuthorAffiliations.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None})
    return SFrame(frame)
def get_user_preferences(user_id, self):
    """Recommend the top-10 combinations for *user_id* using item
    similarity (cosine) over merged preference/combination ratings.

    Note: the unconventional (user_id, self) parameter order is kept
    for backward compatibility with existing callers.

    :param user_id: id of the user to recommend for.
    :return: recommendation rows as a numpy array.
    """
    pref_df = self.get_data_frame('pref')
    comb_df = self.get_data_frame('comb')
    ratings_frame = SFrame(pd.merge(pref_df, comb_df, on='combination_id'))
    item_sim_model = item_similarity_recommender.create(
        ratings_frame,
        user_id='user_id',
        item_id='combination_id',
        target='rating',
        similarity_type='cosine')
    # BUG FIX: `[].append(user_id)` returns None (list.append mutates in
    # place), so the model was always asked to recommend for users=None.
    return item_sim_model.recommend(users=[user_id],
                                    k=10).to_dataframe().values
def create_and_save_model(data: turicreate.SFrame):
    """
    Creates the CoreML model using the data SFrame, and saves it to the
    current directory.

    :param data: The SFrame that was created using the training data
    """
    train_data, test_data = data.random_split(TRAIN_TEST_SPLIT)
    detector = turicreate.one_shot_object_detector.create(
        train_data, target=CARD_NAME_LABEL, batch_size=32)
    # Smoke-test prediction on the held-out split before evaluating.
    _ = detector.predict(test_data)
    print("Ran model.predict")
    metrics = detector.evaluate(test_data)
    print(metrics[ACCURACY_LABEL])
    # Persist both the Turi model and its CoreML export.
    detector.save(MODEL_NAME)
    detector.export_coreml(COREML_MODEL_NAME)
def affiliations(self):
    """Create the affiliations SFrame from the .txt.gz file.

    :return: SFrame of affiliation records.
    """
    cols = [
        "AffiliationId", "Rank", "NormalizedName", "DisplayName",
        "GridId", "OfficialPage", "WikiPage", "PaperCount",
        "CitationCount", "CreatedDate"
    ]
    frame = pd.read_csv(self._dataset_dir / "Affiliations.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None})
    return SFrame(frame)
def get_directors_data(self):
    """Rank movie directors with more than 5 well-voted movies by their
    average rating (descending)."""
    well_voted = self.rating[self.rating["numVotes"] > 10000]
    directors = self.crew.join(well_voted)
    movies = self.title[self.title["titleType"] == "movie"]
    directors = directors.join(movies)
    directors = directors.groupby(key_column_names='directors',
                                  operations={
                                      'averageRating':
                                      agg.AVG("averageRating"),
                                      'count': agg.COUNT()
                                  })
    directors = directors[directors["count"] > 5]
    names = SFrame.read_csv(f"{OUTPUT_PATH}/name.basics.tsv.gz",
                            delimiter="\t")
    directors = directors.join(names, {"directors": "nconst"})
    return directors.sort("averageRating", ascending=False)
def authors(self):
    """Create the authors SFrame from the .txt.gz file.

    Adds 'First name' and 'Last name' columns split from NormalizedName.
    """
    cols = [
        "AuthorId", "Rank", "NormalizedName", "DisplayName",
        "LastKnownAffiliationId", "PaperCount", "CitationCount",
        "CreatedDate"
    ]
    frame = pd.read_csv(self._dataset_dir / "Authors.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None})
    authors = SFrame(frame)
    authors['First name'] = authors['NormalizedName'].apply(
        lambda s: s.split()[0])
    authors['Last name'] = authors['NormalizedName'].apply(
        lambda s: s.split()[-1])
    return authors
def papers(self):
    """Create the Papers SFrame from the .txt.gz file.

    Renames the positional X1..Xn columns to their schema names and
    casts Year to int.
    """
    cols = [
        "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
        "BookTitle", "Year", "Date", "Publisher", "JournalId",
        "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
        "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
        "EstimatedCitation", "OriginalVenue", "CreatedDate"
    ]
    papers_sf = SFrame.read_csv(str(self._dataset_dir / "Papers.txt.gz"),
                                header=False,
                                sep="\t")
    papers_sf = papers_sf.rename(
        dict(zip([f"X{i+1}" for i in range(len(cols))], cols)))
    papers_sf["Year"] = papers_sf["Year"].astype(int)
    return papers_sf