def test_batch(self):
    """Batching 0..999 by ten must give 100 batches, the first starting at 0 and the last ending at 999."""
    numbers = pyparade.Dataset(list(range(0, 1000)), name="Numbers")
    batched = numbers.batch(10)
    batches = batched.collect(name="Batch")

    self.assertEqual(len(batches), 100)

    # Spot-check both ends: first item of the first batch, last item of the
    # last batch.
    first_batch = batches[0]
    self.assertEqual(first_batch[0], 0)
    last_batch = batches[99]
    self.assertEqual(last_batch[9], 999)
def test_error(self):
    """A ValueError raised inside a mapped function must surface from collect()."""

    def throw_error(value):
        # Only the value 3 is rejected; everything else passes through.
        if value != 3:
            return value
        raise ValueError(value)

    numbers = pyparade.Dataset([1, 2, 3, 4, 5, 6, 7, 8, 9], name="Number")

    # Build the pipeline outside the assertion so that only collect() itself
    # is expected to raise.
    failing = numbers.map(throw_error)
    with self.assertRaises(ValueError):
        failing.collect()
def test_wordcount(self):
    """Count words over two lines with flat_map + reduce_by_key and compare with known totals."""
    lines = pyparade.Dataset(
        ["abc test abc test test xyz", "abc test2 abc test cde xyz"])
    pairs = lines.flat_map(
        lambda line: [(word, 1) for word in re.split(" ", line)])
    counted = pairs.reduce_by_key(operator.add).collect(name="Counting words")

    expected = [("abc", 4), ("test", 4), ("xyz", 2), ("test2", 1), ("cde", 1)]

    # Result order is not asserted: the check is equal size plus membership
    # of every returned entry in the expected list.
    self.assertEqual(len(counted), len(expected))
    for entry in counted:
        self.assertIn(entry, expected)
def test_map(self):
    """Map over a slowly produced generator with four workers and compare element sums.

    NOTE(review): a second ``test_map`` is defined further down in this view;
    if both live in the same TestCase, the later definition replaces this
    one and this test never runs -- confirm.
    """

    def slow_generator():
        for value in range(0, 15):
            # Pause between 1 and 6 seconds before producing each item.
            time.sleep(1 + 5 * random.random())
            yield value

    def f(a):
        time.sleep(0.0001)
        return a + 1

    source = pyparade.Dataset(
        slow_generator(), length=15, name="Slowly generated dataset")
    mapped = source.map(f, name="add 1", output_name="Numbers+1")
    inc = mapped.collect(num_workers=4)

    # Only the totals are compared, not the order of the elements.
    expected_total = sum(i + 1 for i in range(0, 15))
    self.assertEqual(expected_total, sum(inc))
def test_group(self):
    """Group a million mapped numbers by (a + 1) % 10 and check each group's mean."""
    numbers = pyparade.Dataset(list(range(0, 1000000)))

    def f(a):
        # 500 throwaway random() calls before producing the pair.
        for _ in range(0, 500):
            random.random()
        shifted = a + 1
        return (shifted % 10, shifted)

    def g(a):
        key, values = a
        return (key, sum(values) / len(values))

    grouped = numbers.map(f).group_by_key()
    result = grouped.map(g).collect()

    # Entry i is expected to carry key i and a mean within 10 of 500000.
    for digit in range(0, 10):
        entry = result[digit]
        self.assertEqual(entry[0], digit)
        self.assertTrue(abs(entry[1] - 500000) <= 10)
def test_map(self):
    """Chain two +1 maps over 0..99999 and verify every output position."""
    numbers = pyparade.Dataset(
        list(range(0, 100000)),
        name="Numbers with a really extremly unnecessarly long dataset name for no reason")

    def f(a):
        time.sleep(0.0001)
        return a + 1

    def g(a):
        time.sleep(0.001)
        return a + 1

    plus_one = numbers.map(f, name="add 1", output_name="Numbers+1")
    plus_two = plus_one.map(g, name="add 1", output_name="Numbers+2")
    inc = plus_two.collect()

    # Count the positions where the output equals input + 2; all 100000
    # positions must match.
    matching = sum(
        1 for got, want in zip(inc, range(2, 100002)) if got == want)
    self.assertEqual(matching, 100000)
def test_fold(self):
    """Fold the values of a two-stage map into a single sum using four workers."""

    def f(a):
        for _ in range(0, 1):
            random.random()
        time.sleep(0.01)
        shifted = a + 1
        return (shifted % 100000, shifted)

    def g(kv):
        for _ in range(0, 5):
            random.random()
        time.sleep(0.01)
        _, value = kv
        return value

    numbers = pyparade.Dataset(list(range(0,10000)), name="Numbers")
    pairs = numbers.map(f, name="calculate", output_name="Key/Value pairs")
    values = pairs.map(g, name="take value", output_name="Values")
    folded = values.fold(0, operator.add, name="sum", output_name="Sum")
    result = folded.collect(num_workers=4)

    # The values are 1..10000, so the first collected element must be their sum.
    self.assertEqual(result[0], sum(range(1, 10001)))
print "ERROR: Skipped placemark due to parsing error: ", e except Exception as e: print "ERROR: Skipped corrupted file " + path + ": ", e return trips def upload_trip(tripdata, cur): """Uploads a trip to the database. Args: tripdata: dictonary containing trip information""" #print tripdata cur.execute( "INSERT INTO %(trip)t (user_id, start_time, end_time, distance, activity, geom, start_geom, end_geom)\ SELECT %(user_id)s, %(start_time)s, %(end_time)s, %(distance)s, %(activity)s,\ ST_LineFromText(%(geom)s, 4326), ST_StartPoint(ST_LineFromText(%(geom)s, 4326)), ST_EndPoint(ST_LineFromText(%(geom)s, 4326))", tripdata) if __name__ == '__main__': files = glob.glob(KML_DIR + '/*/*.kml') print("Creating trips table...") with util.get_cursor() as cur: cur.execute(open("SQL/create_trip.sql", 'r').read()) print("Uploading data...") pyparade.Dataset(files).flat_map(extract_trips_from_kml).map( upload_trip, util.get_cursor).collect(description="Uploading trips")
single_trip_ids = [group for group in trip_group_ids if len(group) == 1] single_trip_ids = [item for sublist in single_trip_ids for item in sublist] #flatten merge_trip_ids = [group for group in trip_group_ids if len(group) > 1] #Insert single trips unchanged print(single_trip_ids) cur.execute("INSERT INTO %(trip_target)t SELECT * FROM %(trip)t WHERE id = ANY(%(trip_ids)s)", {"trip_ids": single_trip_ids}) #Merge grouped trips for group in merge_trip_ids: cur.execute(open(queries["merge_trips"],'r').read(), {"trip_ids": group}) return sum([len(group) for group in merge_trip_ids]) if __name__ == '__main__': print("Creating trips table...") with util.get_cursor() as cur: cur.execute(open(queries["create_trip"], 'r').read(), names = {"trip": config.NAMES["trip_target"]}) print("Uploading data...") user_ids = [] with util.get_cursor() as cur: cur.execute("SELECT user_id FROM %(user_list)t") user_ids = [r[0] for r in cur.fetchall()] merge_count = pyparade.Dataset(user_ids).map(merge_trips, util.get_cursor).fold(0, operator.add).collect(description="Merging trips") print "Merged " + str(merge_count[0]) + " trips."