def test_in_filter_rowgroups(tempdir):
    """An 'in' filter should select exactly the row groups whose stats overlap.

    With ``row_group_offsets=2`` the ten rows land in groups of two
    ([0,1], [2,3], ..., [8,9]); filtering on a value keeps the whole
    group containing it, not just the matching row.
    """
    path = os.path.join(tempdir, 'test.parq')
    frame = pd.DataFrame({
        'x': range(10),
    })
    write(path, frame, row_group_offsets=2)
    pf = ParquetFile(path)

    # (filter values, expected x-column of each surviving row group)
    cases = [
        ([2], [[2, 3]]),
        ([9], [[8, 9]]),
        ([2, 9], [[2, 3], [8, 9]]),
    ]
    for values, expected in cases:
        groups = list(pf.iter_row_groups(filters=[('x', 'in', values)]))
        assert [g.x.tolist() for g in groups] == expected
def test_iter(tempdir):
    """iter_row_groups yields one DataFrame per row group, then stops.

    Writes a two-group file (split at row 2) and checks that the two
    yielded frames reassemble the original, index included, and that the
    iterator is exhausted afterwards.
    """
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    df.index.name = 'index'
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], write_index=True)
    pf = ParquetFile(fn)
    out = iter(pf.iter_row_groups(index='index'))
    d1 = next(out)
    # pd.util.testing was deprecated in pandas 0.25 and removed in 2.0;
    # pd.testing is the supported public API.
    pd.testing.assert_frame_equal(d1, df[:2])
    d2 = next(out)
    pd.testing.assert_frame_equal(d2, df[2:])
    with pytest.raises(StopIteration):
        next(out)
def incremental_train_with_parquet(self, parquet_path):
    """Incrementally fit the multi-output classifier, one parquet row group at a time.

    Each row group is preprocessed into ``self.df``, vectorized, and fed to
    ``partial_fit`` so the whole file never has to sit in memory at once.

    Parameters
    ----------
    parquet_path : str
        Path to the parquet file containing the training data.
    """
    print("Training incrementally with parquet...")
    parquet = ParquetFile(parquet_path)
    # Class list and label frequencies must be known up front so every
    # partial_fit call sees the same label space.
    preprocessing = DataframePreprocessing(target_themes=self.target_themes)
    classes, labels_freq = preprocessing.get_unique_binarized_labels(
        parquet_path, "tema", True)

    rows_seen = 0
    for chunk in parquet.iter_row_groups():
        self._update_dataframe(chunk.reset_index(),
                               is_parquet=True,
                               labels_freq=labels_freq)
        features = self.df[self.x_column_name]
        targets = self.df[self.target_themes + [self.other_themes_value]]
        vectorized = self._vectorize(features)
        self.mo_classifier.partial_fit(vectorized.toarray(),
                                       targets,
                                       classes=classes)
        rows_seen += len(self.df)
        print("{} rows already trained\n".format(rows_seen))
        clear_output(wait=True)
from lenskit.batch import MultiEval
from lenskit.crossfold import partition_users, SampleN
from lenskit import batch, topn, util
from fastparquet import ParquetFile
from tf_idf import tf_idf

# Ground truth: concatenate the held-out test frames from every
# user-partition pair. Use a context manager so the pickle file is closed.
with open("pairs_user_new.pickle", "rb") as file:
    pairs_user = pickle.load(file)
truth = pd.concat((p.test for p in pairs_user))

# Keep only the top-1000 recommendations per row group. Collect the chunks
# and concatenate once: DataFrame.append was removed in pandas 2.0 and
# repeated appends are quadratic in the number of row groups.
pf = ParquetFile('results/steam/pruned_5_new/recommendations.parquet')
chunks = [df.loc[df['rank'] < 1001] for df in pf.iter_row_groups()]
result = pd.concat(chunks, sort=False) if chunks else pd.DataFrame()
#result.to_parquet('results/steam/pruned_5_new/recs.parquet')
#result.to_csv("results/steam/pruned_5_new/recs.csv")


def RR(rec, truth):
    """Compute the reciprocal-rank metric for *rec* against *truth*.

    Parameters
    ----------
    rec : pandas.DataFrame
        Recommendation lists in lenskit's standard format.
    truth : pandas.DataFrame
        Held-out test interactions.

    Returns
    -------
    pandas.DataFrame
        Per-list reciprocal-rank results from ``RecListAnalysis.compute``.
    """
    #recs = pd.read_parquet(file_name)
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.recip_rank)
    return rla.compute(rec, truth)