Example #1
    @property
    def graph_features(self):
        # Lazily compute the graph features, caching them to CSV on first use.
        # (Uses the backing attribute self._graph_features; assigning to the
        # method name itself would shadow it.)
        if len(self._graph_features) == 0:
            try:
                self._graph_features = SFrame.read_csv(
                    f"{DATA_PATH}/bechdel_features.csv")
            except Exception:
                # Features not cached yet; compute them from scratch.
                t = triangles()
                self._graph_features = SFrame.read_csv(
                    "../temp/graph_features.csv")

                self._graph_features = self._graph_features.join(
                    SFrame(get_female_in_top_10_roles()),
                    on={
                        "movie_name": "movie_name",
                        "year": "year"
                    })
                self._graph_features = self._graph_features.join(
                    SFrame(t), on={
                        "movie_name": "movie",
                        "year": "year"
                    })
                # Total triangle count across the four triangle types.
                self._graph_features["total_tri"] = (
                    self._graph_features["0"] + self._graph_features["1"] +
                    self._graph_features["2"] + self._graph_features["3"])
                # Fraction of each triangle type out of the total.
                for i in range(4):
                    self._graph_features[f"{i}%"] = (
                        self._graph_features[str(i)] /
                        self._graph_features["total_tri"])

                self._graph_features.save(
                    f"{DATA_PATH}/bechdel_features.csv", "csv")
        return self._graph_features
Example #2
def _get_sframes(features_train, features_test, labels_train, labels_test):
    logging.debug("turi._get_sframes()")

    # Join features with labels on the shared index, then convert to SFrames.
    train_data: pandas.DataFrame = features_train.join(labels_train)
    test_data: pandas.DataFrame = features_test.join(labels_test)

    train_data_sf = SFrame(data=train_data)
    test_data_sf = SFrame(data=test_data)

    return train_data_sf, test_data_sf
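A minimal driver sketch for _get_sframes: the iris dataset and split parameters are illustrative assumptions, not part of the original snippet.

import logging

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

logging.basicConfig(level=logging.DEBUG)

# Build a pandas feature/label split and hand it to _get_sframes() above.
X, y = load_iris(return_X_y=True, as_frame=True)
features_train, features_test, labels_train, labels_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# The labels come back as named pandas Series, which DataFrame.join accepts.
train_sf, test_sf = _get_sframes(features_train, features_test,
                                 labels_train, labels_test)
print(train_sf.num_rows(), test_sf.num_rows())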
Example #3
import time

import numpy as np
import turicreate as tc
from turicreate import SFrame


def similor_sort(sourceData, classicData, num):
    """
    :param sourceData: DataFrame with a 'path' column of candidate image files
    :param classicData: DataFrame with a 'path' column of classic (query) images
    :param num: how many pictures to pick out (0 means rank all of sourceData)
    :return: sourceData with an added 'distance' column
    """
    start_time = time.time()

    # Load each candidate image into an SFrame of (path, image) rows.
    ref_data = SFrame()
    for index, row in sourceData.iterrows():
        path = row['path']
        img = tc.Image(path)
        ref_data = ref_data.append(SFrame({'path': [path], 'image': [img]}))
    ref_data = ref_data.add_row_number()

    # Load the classic (query) images the same way.
    query_data = SFrame()
    for index, row in classicData.iterrows():
        path = row['path']
        img = tc.Image(path)
        query_data = query_data.append(SFrame({
            'path': [path],
            'image': [img]
        }))
    query_data = query_data.add_row_number()

    # Train an image-similarity model on the candidate images; the classic
    # images will be the queries.
    model = tc.image_similarity.create(ref_data,
                                       label=None,
                                       feature=None,
                                       model='resnet-50',
                                       verbose=True)
    if num == 0:
        num = ref_data.num_rows()

    similar_images = model.query(query_data, k=num)

    # ret_array[i][j] = distance from query image i to candidate image j.
    # Size the columns by the full candidate set: reference_label can be any
    # candidate row index even when k < ref_data.num_rows().
    ret_array = np.zeros((query_data.num_rows(), ref_data.num_rows()))
    for image in similar_images:
        ref_label = image['reference_label']
        distance = image['distance']
        query_label = image['query_label']
        ret_array[query_label][ref_label] = distance

    # Average distance of each candidate across all query images.
    mean = np.mean(ret_array, axis=0)
    sourceData.insert(2, 'distance', mean)

    elapsed_time = time.time() - start_time
    print("Time elapsed = %.2f" % elapsed_time)
    return sourceData
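A hedged usage sketch: the file paths and DataFrame layout below are assumptions for illustration; similor_sort only needs DataFrames with a 'path' column.

import pandas as pd

# Hypothetical inputs: one row per image file.
candidates = pd.DataFrame({'name': ['a', 'b'],
                           'path': ['imgs/a.jpg', 'imgs/b.jpg']})
classics = pd.DataFrame({'path': ['classics/ref.jpg']})

# num=0 ranks every candidate against the classic images; smaller
# distances mean more similar.
ranked = similor_sort(candidates, classics, num=0)
print(ranked.sort_values('distance').head())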
Example #4
    def __init__(self):
        # Pre-built image SFrame and trained similarity model loaded from disk.
        self.imgframe = tc.load_sframe('model/final/final.sframe')
        self.model = tc.load_model('model/final/final_model')
        self.sample = tc.Image()
        self.results = SFrame()
        self.rows = SArray()
        self.pathlist = []
        self.distance_list = []
Example #5
    def urls(self):
        """
        Creating URLs SFrame from .txt.gz files
        """
        cols = ["PaperId", "SourceType", "SourceUrl", "LanguageCode"]
        urls = SFrame(
            pd.read_csv(self._dataset_dir / "PaperUrls.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None}))
        # One row per paper, with all of its URLs collected into a list.
        return urls.groupby("PaperId", {"Urls": agg.CONCAT("SourceUrl")})
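For readers unfamiliar with agg.CONCAT, a minimal toy sketch of the groupby it performs (imports shown are assumptions about the surrounding module):

from turicreate import SFrame, aggregate as agg

sf = SFrame({'PaperId': [1, 1, 2], 'SourceUrl': ['u1', 'u2', 'u3']})
# One row per PaperId, with its URLs gathered into a list column.
print(sf.groupby('PaperId', {'Urls': agg.CONCAT('SourceUrl')}))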
Example #6
    def __init__(self):
        self.bechdel = SFrame.read_csv(f"{DATA_PATH}/bechdel.csv",
                                       column_type_hints={"imdbid": str})
        # sort() returns a new SFrame, so the result must be assigned back.
        self.bechdel = self.bechdel.sort("year", ascending=False)
        self.bechdel["tconst"] = "tt" + self.bechdel["imdbid"]
        self.bechdel_imdb = imdb_data.title.join(self.bechdel)
        self.clf = RandomForestClassifier(n_jobs=-1,
                                          n_estimators=100,
                                          max_depth=5,
                                          random_state=1)
        self._graph_features = SFrame()
Example #7
    def fields_of_study_papers_ids(self, levels=(1, 2, 3)):
        """
        Creates an SFrame with the PaperIds of each field of study
        :param levels: levels of the fields of study to include
        """

        sf = SFrame()
        for level in tqdm(levels):
            sf = sf.append(self._create_field_of_study_paper_ids(level))
        return sf
Example #8
    def fields_of_study(self):
        """
        Creating fields of study SFrame from .txt.gz files
        """
        cols = [
            "FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
            "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"
        ]
        fields_of_study = SFrame(
            pd.read_csv(self._dataset_dir / "FieldsOfStudy.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None}))
        return fields_of_study
Example #9
    def paper_resources(self):
        """
        Creating PaperResources SFrame from .txt.gz files
        ResourceType: 1 = Project, 2 = Data, 4 = Code
        """
        cols = [
            "PaperId", "ResourceType", "ResourceUrl", "SourceUrl",
            "RelationshipType"
        ]
        return SFrame(
            pd.read_csv(self._dataset_dir / "PaperResources.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None}))
Example #10
    def journals(self):
        """
        Creates the Journals SFrame from .txt.gz files, with information on each journal
        """
        cols = [
            "JournalId", "Rank", "NormalizedName", "DisplayName", "Issn",
            "Publisher", "Webpage", "PaperCount", "CitationCount",
            "CreatedDate"
        ]
        journals = SFrame(
            pd.read_csv(self._dataset_dir / "Journals.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None}))
        return journals
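Examples #5 and #8-#10 all share one loading pattern: read a tab-separated .txt.gz with pandas, map pandas' NA marker to None, and wrap the result in an SFrame. A standalone sketch of just that pattern (the file name and schema are placeholders):

import pandas as pd
from turicreate import SFrame

cols = ["Id", "Name"]  # placeholder schema
sf = SFrame(
    pd.read_csv("table.txt.gz", sep="\t", names=cols).replace({pd.NA: None}))

SFrame.read_csv can usually read such files directly (delimiter="\t", header=False), but routing through pandas keeps the explicit NA-to-None handling shown above.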
Example #11
    def get_user_preferences(self, user_id):
        pref_df = self.get_data_frame('pref')
        comb_df = self.get_data_frame('comb')

        ratings_frame = SFrame(pd.merge(pref_df, comb_df, on='combination_id'))

        item_sim_model = item_similarity_recommender.create(
            ratings_frame,
            user_id='user_id',
            item_id='combination_id',
            target='rating',
            similarity_type='cosine')

        # list.append() returns None, so the users argument must be a literal list.
        return item_sim_model.recommend(users=[user_id],
                                        k=10).to_dataframe().values
Example #12
    def paper_author_affiliations(self):
        """
        Creating paper-author affiliations SFrame from .txt.gz files
        :return: SFrame linking papers to authors and affiliations
        """
        cols = [
            "PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber",
            "OriginalAuthor", "OriginalAffiliation"
        ]
        paper_author_affiliations = SFrame(
            pd.read_csv(self._dataset_dir / "PaperAuthorAffiliations.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None}))

        return paper_author_affiliations
Example #13
    def affiliations(self):
        """
        Creating affiliations SFrame from .txt.gz files
        :return: SFrame with one row per affiliation
        """
        cols = [
            "AffiliationId", "Rank", "NormalizedName", "DisplayName", "GridId",
            "OfficialPage", "WikiPage", "PaperCount", "CitationCount",
            "CreatedDate"
        ]
        affiliations = SFrame(
            pd.read_csv(self._dataset_dir / "Affiliations.txt.gz",
                        sep="\t",
                        names=cols).replace({pd.NA: None}))

        return affiliations
Example #14
    def sjr_to_csv(self, regex):
        sjr_sf = SFrame()
        for p in self._dataset_dir.glob(regex):
            if p.suffix == ".csv":
                # The four-digit year is embedded in the file name.
                y = int(re.match(r'.*([1-3][0-9]{3})', p.name).group(1))
                sf = SFrame.read_csv(str(p), delimiter=';')
                sf['Year'] = y
                sf = sf.rename({"Total Docs. (%s)" % y: "Total Docs."})
                extra_cols = ["Categories"]
                for c in extra_cols:
                    if c not in sf.column_names():
                        sf[c] = ''
                sjr_sf = sjr_sf.append(sf)

        # Each journal may list several 8-digit ISSNs; extract them all and
        # expand to one row per ISSN.
        r_issn = re.compile(r'(\d{8})')
        sjr_sf['Issn'] = sjr_sf['Issn'].apply(lambda i: r_issn.findall(i))
        return sjr_sf.stack('Issn', new_column_name='ISSN')
Example #15
    def authors(self):
        """
        Creates authors names SFrames from .txt.gz files
        """
        authors = SFrame(
            pd.read_csv(self._dataset_dir / "Authors.txt.gz",
                        sep="\t",
                        names=[
                            "AuthorId", "Rank", "NormalizedName",
                            "DisplayName", "LastKnownAffiliationId",
                            "PaperCount", "CitationCount", "CreatedDate"
                        ]).replace({pd.NA: None}))
        # Derive first/last name columns from the normalized full name.
        authors['First name'] = authors['NormalizedName'].apply(
            lambda s: s.split()[0])
        authors['Last name'] = authors['NormalizedName'].apply(
            lambda s: s.split()[-1])
        return authors
Example #16
def sframe(frame):
    # Wrap a tabular object in an SFrame and open Turi Create's
    # interactive explore/show visualizations.
    sf = SFrame(frame)
    sf.explore()
    sf.show()
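A quick usage sketch for the helper above (the DataFrame contents are placeholders):

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
sframe(df)  # opens the interactive table view and summary plots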
Example #17
def _copy_from_sarray(sa, buf, start, end, field_length, bias=0):
    assert isinstance(sa, SArray)
    # Wrap the SArray in a single-column SFrame and reuse the SFrame copier.
    sf = SFrame({'__tmp__': sa})
    _copy_from_sframe(sf, buf, start, end, [field_length], bias)
Example #18
def _copy_from_sarray(sa, buf, start, end, shape, bias=0):
    assert isinstance(sa, SArray)
    sf = SFrame({'__tmp__': sa})
    _copy_from_sframe(sf, buf, start, end, shape, bias)
Example #19
    if multivariate:
        visualisation(y_pred=y_pred[:, 0].reshape(-1, 1),
                      Y_test=Y_test[:, 0].reshape(-1, 1),
                      y_difference=y_difference[:, 0].reshape(-1, 1))  # Insertion
        visualisation(y_pred=y_pred[:, 1].reshape(-1, 1),
                      Y_test=Y_test[:, 1].reshape(-1, 1),
                      y_difference=y_difference[:, 1].reshape(-1, 1))  # Deletion
    else:
        visualisation(y_pred, Y_test, y_difference)

    # Looking into SFrame as an alternative to pandas; it has S3 support.
    # TensorFlow also has S3 and GCP support if you install from source and enable it.
    sf = SFrame(build_csv(build_data(sample)[5], build_data(sample)[7]))
    sf.explore()
    sf.show()

else:
    # Neural network architecture

    def build_regressor():

        # Initialising the neural network
        regressor = Sequential()

        # Input layer and first hidden layer with dropout
        regressor.add(
            Dense(units=10,
                  kernel_initializer='uniform',
Example #20
from turicreate import item_similarity_recommender, SFrame

import pandas as pd
# import numpy as np

data_path = '/home/riffel/Projects/vardiety/vardiety-recommendation/data'

combinations = pd.read_csv(data_path + '/combinations.csv', sep=',', names=['combination_id', 'items'])
preferences = pd.read_csv(data_path + '/preferences.csv', sep=',', names=['user_id', 'combination_id', 'rating'])

ratings = pd.merge(combinations, preferences, on='combination_id')

ratings_frame = SFrame(ratings)

print(ratings_frame)

item_sim_model = item_similarity_recommender.create(
    ratings_frame,
    user_id='user_id',
    item_id='combination_id',
    target='rating',
    similarity_type='pearson'
)

item_sim_recom = item_sim_model.recommend(users=[2], k=10)

print(item_sim_recom)

'''
n_users = ratings.user_id.unique().shape[0]
n_combinations = ratings.combination_id.unique().shape[0]
'''
Example #21
            coordinate = {
                'coordinates': coordinates_from_bounding_box(bounding_box, image_size),
                'label': class_name,
                'type': 'rectangle'
            }
            sframe_entry.append(coordinate)
        return sframe_entry

    sframe_annotations = map(
        lambda kv: sframe_annotations_from_labelbox_annotations(kv),
        labelbox_annotations)

    # Flatten to a single list of annotation dictionaries, the shape
    # turicreate expects for an image's annotations.
    flat_sframe_annotations = [
        item for sublist in sframe_annotations for item in sublist
    ]

    return (img, flat_sframe_annotations)


rows = map(lambda annotated_image: row_from_annotated_image(annotated_image),
           total_annotated_images)
images, annotations = zip(*rows)

sframe = SFrame({"image": images, "annotations": annotations})
sframe.save(sframe_path)
print(repr(sframe))
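For reference, each dictionary built above follows Turi Create's object-detection annotation shape. A minimal hand-written instance (the values are made up; x and y are the box centre in pixels under Turi Create's convention):

annotation = {
    'coordinates': {'x': 120, 'y': 80, 'width': 60, 'height': 40},
    'label': 'dog',
    'type': 'rectangle'
}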
Example #22
    def first_name_gender(self):
        if self._first_name_gender is None:
            # Lazily load and cache the first-name/gender SFrame.
            self._first_name_gender = SFrame(
                f"{DATA_PATH}/first_names_gender.sframe")
            self._first_name_gender = self._first_name_gender.unstack(
                ["First Name", "Gender Dict"])[0]["Dict of First Name_Gender Dict"]
        return self._first_name_gender
Example #23
# Keep only users with more than 20 interactions.
sub_df = df[df.groupby('user').user.transform('count') > 20].copy()

# Getting a list of all unique users
users = sub_df['user'].unique()

# Splitting the users into train and test
users_train, users_test = train_test_split(users,
                                           test_size=0.02,
                                           random_state=42)

# Creating separate dataframes for training users and testing users
train_df = sub_df[sub_df.user.isin(users_train)].copy()
test_df = sub_df[sub_df.user.isin(users_test)].copy()

# Align both frames on the same column names so the SFrames can be appended later.
train_df.columns = ['user', 'item', 'rating']
test_df.columns = ['user', 'item', 'rating']
sf1 = SFrame(data=train_df)
sf2 = SFrame(data=test_df)

# Retaining some portion of the test users' data in the training data for the original RecSys
train, test = tc.recommender.util.random_split_by_user(sf2,
                                                       user_id='user',
                                                       item_id='item',
                                                       max_num_users=26788)

# Part of each test user's data is folded into the original RecSys training set so that their RMSE can be calculated.
# SFrame concatenation uses append(); SFrames do not support "+".
final_sf = sf1.append(train)

test_users_train = train.to_dataframe()
test_users_eval = test.to_dataframe()

sample_of_users = test['user']
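A hedged follow-up sketch of how these splits might be used: train on final_sf and score RMSE on the held-out test interactions. The model choice and the evaluation call are assumptions about intent, not part of the original snippet.

import turicreate as tc

# Train on training users plus the retained slice of test-user history.
model = tc.item_similarity_recommender.create(final_sf,
                                              user_id='user',
                                              item_id='item',
                                              target='rating')

# RMSE over the unseen test-user interactions.
print(model.evaluate_rmse(test, target='rating'))

# Top-10 recommendations for the held-out users.
recs = model.recommend(users=sample_of_users, k=10)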