示例#1
0
def test_in_filter_rowgroups(tempdir):
    """Row-group 'in' filters must select exactly the groups containing the values."""
    path = os.path.join(tempdir, 'test.parq')
    # 10 rows, 2 per row group -> groups [0,1], [2,3], ..., [8,9]
    write(path, pd.DataFrame({'x': range(10)}), row_group_offsets=2)
    pf = ParquetFile(path)

    # Value 2 lives only in the [2, 3] group.
    hits = list(pf.iter_row_groups(filters=[('x', 'in', [2])]))
    assert len(hits) == 1
    assert hits[0].x.tolist() == [2, 3]

    # Value 9 lives only in the [8, 9] group.
    hits = list(pf.iter_row_groups(filters=[('x', 'in', [9])]))
    assert len(hits) == 1
    assert hits[0].x.tolist() == [8, 9]

    # Values in two different groups select both, in file order.
    hits = list(pf.iter_row_groups(filters=[('x', 'in', [2, 9])]))
    assert len(hits) == 2
    assert hits[0].x.tolist() == [2, 3]
    assert hits[1].x.tolist() == [8, 9]
示例#2
0
def test_iter(tempdir):
    """iter_row_groups yields one DataFrame per row group, then stops.

    Fix: ``pd.util.testing`` was deprecated in pandas 1.0 and removed in
    2.0; the supported public module is ``pd.testing``.
    """
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    df.index.name = 'index'

    fn = os.path.join(tempdir, 'foo.parquet')
    # Two row groups: rows [0, 2) and [2, end).
    write(fn, df, row_group_offsets=[0, 2], write_index=True)
    pf = ParquetFile(fn)
    out = iter(pf.iter_row_groups(index='index'))
    d1 = next(out)
    pd.testing.assert_frame_equal(d1, df[:2])
    d2 = next(out)
    pd.testing.assert_frame_equal(d2, df[2:])
    # Exactly two groups were written, so the iterator must now be exhausted.
    with pytest.raises(StopIteration):
        next(out)
示例#3
0
def test_iter(tempdir):
    """iter_row_groups yields one DataFrame per row group, then stops.

    Fix: ``pd.util.testing`` was deprecated in pandas 1.0 and removed in
    2.0; the supported public module is ``pd.testing``.
    """
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    df.index.name = 'index'

    fn = os.path.join(tempdir, 'foo.parquet')
    # Two row groups: rows [0, 2) and [2, end).
    write(fn, df, row_group_offsets=[0, 2], write_index=True)
    pf = ParquetFile(fn)
    out = iter(pf.iter_row_groups(index='index'))
    d1 = next(out)
    pd.testing.assert_frame_equal(d1, df[:2])
    d2 = next(out)
    pd.testing.assert_frame_equal(d2, df[2:])
    # Exactly two groups were written, so the iterator must now be exhausted.
    with pytest.raises(StopIteration):
        next(out)
示例#4
0
 def incremental_train_with_parquet(self, parquet_path):
     """Incrementally train the multi-output classifier from a parquet file.

     Streams the file one row group at a time via fastparquet's
     ``iter_row_groups`` so the whole dataset never has to fit in memory,
     calling ``partial_fit`` once per row group.

     Args:
         parquet_path: Path to the parquet file containing the training data.
     """
     print("Training incrementally with parquet...")
     nrows = 0  # running count of rows trained so far (for progress output)
     pf = ParquetFile(parquet_path)
     # Binarized label set must be computed over the WHOLE file up front,
     # because partial_fit needs the full `classes` list on every call.
     # NOTE(review): presumably "tema" is the label column and the flag
     # enables parquet reading — confirm against DataframePreprocessing.
     classes, labels_freq = DataframePreprocessing(
         target_themes=self.target_themes).get_unique_binarized_labels(
             parquet_path, "tema", True)
     for df in pf.iter_row_groups():
         # Row groups keep the file's index; flatten it to a plain column.
         df = df.reset_index()
         # Preprocess the chunk in place; result is exposed as self.df.
         self._update_dataframe(df,
                                is_parquet=True,
                                labels_freq=labels_freq)
         X_train, y_train = (
             self.df[self.x_column_name],
             self.df[self.target_themes + [self.other_themes_value]],
         )
         vector = self._vectorize(X_train)
         # toarray(): _vectorize apparently returns a sparse matrix; the
         # classifier is fed the dense form here.
         self.mo_classifier.partial_fit(vector.toarray(),
                                        y_train,
                                        classes=classes)
         nrows += len(self.df)
         print("{} rows already trained\n".format(nrows))
         clear_output(wait=True)
示例#5
0
from lenskit.batch import MultiEval
from lenskit.crossfold import partition_users, SampleN
from lenskit import batch, topn, util
from tf_idf import tf_idf

# SECURITY NOTE: pickle.load executes arbitrary code from the file — only
# load pickles you produced yourself, never untrusted input.
# Fixes: `file` shadowed the builtin and the handle was never closed (use a
# context manager); DataFrame.append in a loop is quadratic and was removed
# in pandas 2.0 — collect the chunks and concatenate once instead.
with open("pairs_user_new.pickle", "rb") as fh:
    pairs_user = pickle.load(fh)

# Ground-truth interactions: union of the test folds of every user partition.
truth = pd.concat((p.test for p in pairs_user))

from fastparquet import ParquetFile

# Stream the recommendations one row group at a time, keeping only the
# top-1000 ranks of each chunk, then concatenate once at the end.
pf = ParquetFile('results/steam/pruned_5_new/recommendations.parquet')
frames = [df.loc[df['rank'] < 1001] for df in pf.iter_row_groups()]
result = pd.concat(frames, sort=False) if frames else pd.DataFrame()

#result.to_parquet('results/steam/pruned_5_new/recs.parquet')
#result.to_csv("results/steam/pruned_5_new/recs.csv")


def RR(rec, truth):
    """Compute reciprocal rank for each recommendation list against *truth*.

    Args:
        rec: recommendation DataFrame (one row per recommended item).
        truth: ground-truth interactions to score against.

    Returns:
        The per-list reciprocal-rank results from RecListAnalysis.compute.
    """
    analysis = topn.RecListAnalysis()
    analysis.add_metric(topn.recip_rank)
    return analysis.compute(rec, truth)