@property
def graph_features(self):
    # Lazily build and cache the features SFrame. The backing field is
    # self._graph_features; the original assigned to self.graph_features,
    # which shadows (and recurses into) this accessor.
    if not self._graph_features:
        try:
            self._graph_features = SFrame.read_csv(
                f"{DATA_PATH}/bechdel_features.csv")
        except Exception:
            t = triangles()
            self._graph_features = SFrame.read_csv(
                "../temp/graph_features.csv")
            self._graph_features = self._graph_features.join(
                SFrame(get_female_in_top_10_roles()),
                on={"movie_name": "movie_name", "year": "year"})
            self._graph_features = self._graph_features.join(
                SFrame(t), on={"movie_name": "movie", "year": "year"})
            self._graph_features["total_tri"] = (
                self._graph_features["0"] + self._graph_features["1"] +
                self._graph_features["2"] + self._graph_features["3"])
            # Turn the four triangle counts into fractions of the total.
            for i in range(4):
                self._graph_features[f"{i}%"] = (
                    self._graph_features[str(i)] /
                    self._graph_features["total_tri"])
            self._graph_features.save(
                f"{DATA_PATH}/bechdel_features.csv", "csv")
    return self._graph_features
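# A minimal sketch of the join-with-renamed-keys pattern used above: the `on`
# dict maps a left column name to a differently named right column (toy
# frames, not the real movie data):
from turicreate import SFrame

left = SFrame({'movie_name': ['alien'], 'year': [1979]})
right = SFrame({'movie': ['alien'], 'year': [1979], 'tri': [7]})
print(left.join(right, on={'movie_name': 'movie', 'year': 'year'}))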
def _get_sframes(features_train, features_test, labels_train, labels_test):
    logging.debug("turi._get_sframes()")
    # Re-attach the labels to the features, then wrap both splits as SFrames.
    train_data: pandas.DataFrame = features_train.join(labels_train)
    test_data: pandas.DataFrame = features_test.join(labels_test)
    train_data_sf = SFrame(data=train_data)
    test_data_sf = SFrame(data=test_data)
    return train_data_sf, test_data_sf
def similor_sort(sourceData, classicData, num):
    """
    :param sourceData: DataFrame of candidate images, with a 'path' column
    :param classicData: DataFrame of reference ("classic") images, with a 'path' column
    :param num: how many nearest images to query per reference (0 = all)
    :return: sourceData with a mean 'distance' column inserted
    """
    start_time = time.time()
    ref_data = SFrame()
    for index, row in sourceData.iterrows():
        path = row['path']
        img = tc.Image(path)
        ref_data = ref_data.append(SFrame({'path': [path], 'image': [img]}))
    ref_data = ref_data.add_row_number()

    query_data = SFrame()
    for index, row in classicData.iterrows():
        path = row['path']
        img = tc.Image(path)
        query_data = query_data.append(SFrame({'path': [path],
                                               'image': [img]}))
    query_data = query_data.add_row_number()

    model = tc.image_similarity.create(ref_data, label=None, feature=None,
                                       model='resnet-50', verbose=True)
    if num == 0:
        num = ref_data.num_rows()
    similar_images = model.query(query_data, k=num)

    # Reference labels index into ref_data, so size the matrix by its row
    # count (the original used `num`, which breaks when num < ref rows).
    ret_array = np.zeros((query_data.num_rows(), ref_data.num_rows()))
    for image in similar_images:
        ret_array[image['query_label']][image['reference_label']] = \
            image['distance']

    # Average the distances over all query images; one value per source row.
    mean = np.mean(ret_array, axis=0)
    sourceData.insert(2, 'distance', mean)

    elapsed_time = time.time() - start_time
    print("Time elapsed = %d" % elapsed_time)
    return sourceData
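# A hedged usage sketch for similor_sort with hypothetical file paths; it
# assumes both DataFrames carry a 'path' column, as the loops above require.
import pandas as pd

source = pd.DataFrame({'name': ['a', 'b'],
                       'label': [0, 1],
                       'path': ['imgs/a.jpg', 'imgs/b.jpg']})
classic = pd.DataFrame({'path': ['imgs/reference.jpg']})

# num=0 compares against every candidate; the mean distance to the reference
# images lands in the 'distance' column, so sorting by it ranks the images.
ranked = similor_sort(source, classic, num=0).sort_values('distance')
print(ranked)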
def __init__(self):
    # Pre-trained similarity model and its reference images, plus holders
    # for the query image and its results.
    self.imgframe = tc.load_sframe('model/final/final.sframe')
    self.model = tc.load_model('model/final/final_model')
    self.sample = tc.Image()
    self.results = SFrame()
    self.rows = SArray()
    self.pathlist = []
    self.distance_list = []
def urls(self):
    """
    Creating URLs SFrame from .txt.gz files
    """
    cols = ["PaperId", "SourceType", "SourceUrl", "LanguageCode"]
    urls = SFrame(
        pd.read_csv(self._dataset_dir / "PaperUrls.txt.gz", sep="\t",
                    names=cols).replace({pd.NA: None}))
    # Collect all of a paper's URLs into a single list column.
    return urls.groupby("PaperId", {"Urls": agg.CONCAT("SourceUrl")})
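# A minimal sketch of the groupby + CONCAT pattern used above, on toy rows
# rather than the real PaperUrls dump:
from turicreate import SFrame
from turicreate import aggregate as agg

toy = SFrame({'PaperId': [1, 1, 2],
              'SourceUrl': ['http://a', 'http://b', 'http://c']})
# CONCAT gathers each group's SourceUrl values into one list-typed column.
print(toy.groupby('PaperId', {'Urls': agg.CONCAT('SourceUrl')}))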
def __init__(self):
    self.bechdel = SFrame.read_csv(f"{DATA_PATH}/bechdel.csv",
                                   column_type_hints={"imdbid": str})
    # sort() returns a new SFrame rather than sorting in place, so the
    # result must be assigned back (the original discarded it).
    self.bechdel = self.bechdel.sort("year", ascending=False)
    self.bechdel["tconst"] = "tt" + self.bechdel["imdbid"]
    self.bechdel_imdb = imdb_data.title.join(self.bechdel)
    self.clf = RandomForestClassifier(n_jobs=-1, n_estimators=100,
                                      max_depth=5, random_state=1)
    self._graph_features = SFrame()
def fields_of_study_papers_ids(self, levels=(1, 2, 3)):
    """
    Creates an SFrame with the PaperIds of each field of study
    :param levels: levels of the fields-of-study hierarchy to include
    """
    sf = SFrame()
    for level in tqdm(levels):
        sf = sf.append(self._create_field_of_study_paper_ids(level))
    return sf
def fields_of_study(self):
    """
    Creating Fields of Study SFrame from .txt.gz files
    """
    cols = [
        "FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
        "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"
    ]
    fields_of_study = SFrame(
        pd.read_csv(self._dataset_dir / "FieldsOfStudy.txt.gz", sep="\t",
                    names=cols).replace({pd.NA: None}))
    return fields_of_study
def paper_resources(self):
    """
    Creating paper resources SFrame from .txt.gz files.
    ResourceType: 1 = Project, 2 = Data, 4 = Code
    """
    cols = [
        "PaperId", "ResourceType", "ResourceUrl", "SourceUrl",
        "RelationshipType"
    ]
    return SFrame(
        pd.read_csv(self._dataset_dir / "PaperResources.txt.gz", sep="\t",
                    names=cols).replace({pd.NA: None}))
def journals(self):
    """
    Creates the Journals SFrame from .txt.gz files, with information on
    each journal
    """
    cols = [
        "JournalId", "Rank", "NormalizedName", "DisplayName", "Issn",
        "Publisher", "Webpage", "PaperCount", "CitationCount", "CreatedDate"
    ]
    journals = SFrame(
        pd.read_csv(self._dataset_dir / "Journals.txt.gz", sep="\t",
                    names=cols).replace({pd.NA: None}))
    return journals
def get_user_preferences(self, user_id):
    # `self` must come first (the original had it second). Also,
    # `[].append(user_id)` returns None, so the users list is built directly.
    pref_df = self.get_data_frame('pref')
    comb_df = self.get_data_frame('comb')
    ratings_frame = SFrame(pd.merge(pref_df, comb_df, on='combination_id'))
    item_sim_model = item_similarity_recommender.create(
        ratings_frame,
        user_id='user_id',
        item_id='combination_id',
        target='rating',
        similarity_type='cosine')
    return item_sim_model.recommend(users=[user_id],
                                    k=10).to_dataframe().values
def paper_author_affiliations(self):
    """
    Creating author affiliations SFrame from .txt.gz files
    :return:
    """
    cols = [
        "PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber",
        "OriginalAuthor", "OriginalAffiliation"
    ]
    paper_author_affiliations = SFrame(
        pd.read_csv(self._dataset_dir / "PaperAuthorAffiliations.txt.gz",
                    sep="\t", names=cols).replace({pd.NA: None}))
    return paper_author_affiliations
def affiliations(self):
    """
    Creating affiliations SFrame from .txt.gz files
    :return:
    """
    cols = [
        "AffiliationId", "Rank", "NormalizedName", "DisplayName", "GridId",
        "OfficialPage", "WikiPage", "PaperCount", "CitationCount",
        "CreatedDate"
    ]
    affiliations = SFrame(
        pd.read_csv(self._dataset_dir / "Affiliations.txt.gz", sep="\t",
                    names=cols).replace({pd.NA: None}))
    return affiliations
def sjr_to_csv(self, regex):
    sjr_sf = SFrame()
    for p in self._dataset_dir.glob(regex):
        if p.suffix == ".csv":
            # Pull the four-digit year out of the file name.
            y = int(re.match(r'.*([1-3][0-9]{3})', p.name).group(1))
            sf = SFrame.read_csv(str(p), delimiter=';')
            sf['Year'] = y
            sf = sf.rename({"Total Docs. (%s)" % y: "Total Docs."})
            extra_cols = ["Categories"]
            for c in extra_cols:
                if c not in sf.column_names():
                    sf[c] = ''
            sjr_sf = sjr_sf.append(sf)
    r_issn = re.compile(r'(\d{8})')
    sjr_sf['Issn'] = sjr_sf['Issn'].apply(lambda i: r_issn.findall(i))
    # One row per ISSN: expand the list column produced by findall().
    return sjr_sf.stack('Issn', new_column_name='ISSN')
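# A minimal sketch of the stack() call that ends sjr_to_csv: a list column is
# expanded to one row per element (toy values, not the real SJR data):
from turicreate import SFrame

toy = SFrame({'Title': ['Journal A'], 'Issn': [['12345678', '87654321']]})
# Yields two rows for 'Journal A', one per ISSN.
print(toy.stack('Issn', new_column_name='ISSN'))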
def authors(self):
    """
    Creates the authors' names SFrame from .txt.gz files
    """
    authors = SFrame(
        pd.read_csv(self._dataset_dir / "Authors.txt.gz", sep="\t",
                    names=[
                        "AuthorId", "Rank", "NormalizedName", "DisplayName",
                        "LastKnownAffiliationId", "PaperCount",
                        "CitationCount", "CreatedDate"
                    ]).replace({pd.NA: None}))
    # Derive first and last names from the normalized full name.
    authors['First name'] = authors['NormalizedName'].apply(
        lambda s: s.split()[0])
    authors['Last name'] = authors['NormalizedName'].apply(
        lambda s: s.split()[-1])
    return authors
def sframe(frame):
    # Wrap an arbitrary frame and open turicreate's interactive viewers.
    sf = SFrame(frame)
    sf.explore()
    sf.show()
    return
def _copy_from_sarray(sa, buf, start, end, field_length, bias=0):
    assert isinstance(sa, SArray)
    # Wrap the SArray in a single-column SFrame and reuse the SFrame path.
    sf = SFrame({'__tmp__': sa})
    _copy_from_sframe(sf, buf, start, end, [field_length], bias)
def _copy_from_sarray(sa, buf, start, end, shape, bias=0):
    assert isinstance(sa, SArray)
    # Variant of the helper above that forwards an explicit shape.
    sf = SFrame({'__tmp__': sa})
    _copy_from_sframe(sf, buf, start, end, shape, bias)
    if multivariate:
        visualisation(y_pred=y_pred[:, 0].reshape(-1, 1),
                      Y_test=Y_test[:, 0].reshape(-1, 1),
                      y_difference=y_difference[:, 0].reshape(-1, 1))  # Insertion
        visualisation(y_pred=y_pred[:, 1].reshape(-1, 1),
                      Y_test=Y_test[:, 1].reshape(-1, 1),
                      y_difference=y_difference[:, 1].reshape(-1, 1))  # Deletion
    else:
        visualisation(y_pred, Y_test, y_difference)

    # Looking into SFrame as an alternative to pandas; it has S3 support.
    # TensorFlow also has S3 and GCP support if you install from source
    # and enable it.
    sf = SFrame(build_csv(build_data(sample)[5], build_data(sample)[7]))
    sf.explore()
    sf.show()
else:
    # Neural network architecture
    def build_regressor():
        # Initialising the neural network
        regressor = Sequential()
        # Input layer and first hidden layer, with dropout
        regressor.add(
            Dense(units=10, kernel_initializer='uniform',
from turicreate import item_similarity_recommender, SFrame
import pandas as pd
# import numpy as np

data_path = '/home/riffel/Projects/vardiety/vardiety-recommendation/data'

combinations = pd.read_csv(data_path + '/combinations.csv', sep=',',
                           names=['combination_id', 'items'])
preferences = pd.read_csv(data_path + '/preferences.csv', sep=',',
                          names=['user_id', 'combination_id', 'rating'])

# Join the two CSVs on the combination id and hand the result to turicreate.
ratings = pd.merge(combinations, preferences, on='combination_id')
ratings_frame = SFrame(ratings)
print(ratings_frame)

item_sim_model = item_similarity_recommender.create(
    ratings_frame,
    user_id='user_id',
    item_id='combination_id',
    target='rating',
    similarity_type='pearson'
)
item_sim_recom = item_sim_model.recommend(users=[2], k=10)
print(item_sim_recom)

'''
n_users = ratings.user_id.unique().shape[0]
n_combinations = ratings.combination_id.unique().shape[0]
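# A hedged follow-up sketch: a fitted turicreate recommender can also be
# scored on held-out ratings; `held_out` below is a stand-in slice, not a
# split the script above actually makes.
held_out = ratings_frame.head(10)
print(item_sim_model.evaluate_rmse(held_out, target='rating'))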
            coordinate = {
                'coordinates': coordinates_from_bounding_box(bounding_box,
                                                             image_size),
                'label': class_name,
                'type': 'rectangle'
            }
            sframe_entry.append(coordinate)
        return sframe_entry

    sframe_annotations = map(
        lambda kv: sframe_annotations_from_labelbox_annotations(kv),
        labelbox_annotations)
    # we need to flatten it to a list of dictionaries ... turicreate expects
    # a flat list of annotations per image
    flat_sframe_annotations = [
        item for sublist in sframe_annotations for item in sublist
    ]
    return (img, flat_sframe_annotations)

rows = map(lambda annotated_image: row_from_annotated_image(annotated_image),
           total_annotated_images)
images, annotations = zip(*rows)
sframe = SFrame({"image": images, "annotations": annotations})
sframe.save(sframe_path)
print(repr(sframe))
def first_name_gender(self):
    if self._first_name_gender is None:
        self._first_name_gender = SFrame(
            f"{DATA_PATH}/first_names_gender.sframe")
        # Pack the name/gender columns into a single dict keyed by first name.
        self._first_name_gender = self._first_name_gender.unstack(
            ["First Name", "Gender Dict"])[0]["Dict of First Name_Gender Dict"]
    return self._first_name_gender
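# A minimal sketch of the unstack() step above: with only a key and a value
# column, unstack([key, value]) collapses the frame to one dict column whose
# default name is "Dict of <key>_<value>" (toy values, not the real data):
from turicreate import SFrame

toy = SFrame({'First Name': ['ada', 'alan'], 'Gender Dict': ['F', 'M']})
packed = toy.unstack(['First Name', 'Gender Dict'])
print(packed.column_names())  # ['Dict of First Name_Gender Dict']
print(packed[0]['Dict of First Name_Gender Dict'])  # {'ada': 'F', 'alan': 'M'}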
# Keep only users with more than 20 interactions.
sub_df = df[df.groupby('user').user.transform('count') > 20].copy()

# Getting a list of all unique users
users = sub_df['user'].unique()

# Splitting the users into train and test
users_train, users_test = train_test_split(users, test_size=0.02,
                                           random_state=42)

# Creating separate dataframes for training users and testing users
train_df = sub_df[sub_df.user.isin(users_train)]
test_df = sub_df[sub_df.user.isin(users_test)]
test_df.columns = ['user', 'item', 'rating']

sf1 = SFrame(data=train_df)
sf2 = SFrame(data=test_df)

# Retaining some portion of the test users' data in the training data for
# the Original RecSys
train, test = tc.recommender.util.random_split_by_user(sf2, user_id='user',
                                                       item_id='item',
                                                       max_num_users=26788)

# Some of the test users' data is added to the Original RecSys so that their
# RMSE can be calculated; append() concatenates the SFrames row-wise
# (the original used `sf1 + train`, which is not SFrame concatenation).
final_sf = sf1.append(train)

test_users_train = train.to_dataframe()
test_users_eval = test.to_dataframe()
sample_of_users = test['user']