Example #1
0
    def time_merge_2intkey(self):
        """Benchmark incremental pipeline merges: join ``self.left`` against
        successive right-hand chunks, threading the factorizer/sorter state
        returned by each call into the next one.
        """
        # Prime the incremental state from the first chunk (index 2).
        result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
            self.left,
            self.right[2],
            slices=10,
            how="pipeline_merge",
        )
        # Feed the remaining chunks, reusing the accumulated state so the
        # left table is not re-factorized from scratch each time.
        for chunk in range(3, self.pieces):
            result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
                self.left,
                self.right[chunk],
                factorizer=orizer,
                intfactorizer=intrizer,
                leftsorter=leftsorter,
                leftcount=leftcount,
                slices=10,
                how="pipeline_merge",
            )
Example #2
0
 def time_merge_2intkey(self):
     """Run a single pipeline merge of ``self.left`` with ``self.right``."""
     result = pipeline_merge(
         self.left,
         self.right,
         how="pipeline",
     )
Example #3
0
    def run_command(key):
        """Reducer task: receive right-table partitions from Jiffy queues and
        join each against a locally generated left table.

        Parameters
        ----------
        key : dict
            Task descriptor with 'partition_num', 'rounds', 'em', 'taskId',
            'appName', 'type' ('pipelined' or 'origin'), 'names', 'dtypes'.

        Returns
        -------
        list
            ``[t_fin, t_start]`` — join-phase end and start wall-clock times
            (note: finish time comes first).
        """
        pywren.wrenlogging.default_config('INFO')
        logging.basicConfig(level=logging.DEBUG)
        logger = logging.getLogger(__name__)
        logger.info("before everything")
        partition_num = key['partition_num']
        rounds = key['rounds']
        em = JiffyClient(host=key['em'])
        reduceId = key['taskId']
        appName = key['appName']
        alg_type = key['type']
        data_ques1 = open_or_create_jiffy_queues(em, appName, partition_num, 1,
                                                 'receiver')
        logger.info("queue opened")
        names = key['names']
        dtypes = key['dtypes']
        # ---- build the synthetic left table ---------------------------------
        left_table = 100000
        indices = tm.makeStringIndex(left_table).values
        # NOTE: this rebinds the ``key`` parameter; every dict read from it is
        # done above, so the shadowing is harmless (if confusing).
        key = np.tile(indices[:left_table], 1)
        left = DataFrame({"key": key, "value": np.random.randn(left_table)})
        t_start = time.time()
        lim = 0          # safety bound on the polling loops
        fin_num = 0      # upstream partitions reported finished so far

        if alg_type == 'pipelined':
            # Incremental-merge state threaded through pipeline_merge calls.
            leftsorter = None
            leftcount = None
            orizer = None
            intrizer = None
            count = 0

            while fin_num < partition_num and lim < 15:
                #### read table
                lim += 1
                time.sleep(0.01)
                logger.info("before get")
                obj = data_ques1[0].get()
                if sys.getsizeof(obj) > 1000:
                    # NOTE(review): part_data is parsed but never used below;
                    # kept for parity with the original behavior.
                    part_data = pd.read_table(BytesIO(obj),
                                              header=None,
                                              delimiter="|",
                                              names=['key', 'value2'])

                # BUGFIX: this call was commented out, which left ``ds``
                # undefined and crashed the loop with a NameError.
                ds, fin_num = read_jiffy_splits(names,
                                                dtypes,
                                                reduceId,
                                                data_ques1,
                                                fin_num,
                                                batch_size=1,
                                                fin_size=partition_num)
                logger.info(ds)
                logger.info(fin_num)
                if len(ds) > 0:
                    # Join the newly received chunk, reusing the accumulated
                    # factorizer/sorter state from previous iterations.
                    result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
                        left,
                        ds,
                        factorizer=orizer,
                        intfactorizer=intrizer,
                        leftsorter=leftsorter,
                        leftcount=leftcount,
                        slices=8,
                        how="pipeline")
                    time.sleep(0.8)
                    logger.info("merged")

        elif alg_type == 'origin':
            # Baseline: accumulate every chunk and re-run a full merge each time.
            ds = pd.DataFrame()
            while fin_num < partition_num and lim < 1500:
                lim += 1
                #### read table
                dd, fin_num = read_jiffy_splits(names,
                                                dtypes,
                                                reduceId,
                                                data_ques1,
                                                fin_num,
                                                batch_size=1,
                                                fin_size=partition_num)
                if len(dd) > 0:

                    ds = ds.append(dd)
                print("this is ds:")
                print(ds)
                result = merge(left, ds, how="inner")
                print(fin_num)
        t_fin = time.time()
        return ([t_fin, t_start])
Example #4
0
    'key': ['K5', 'K7', 'K5', 'K50'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})
# Two right-hand tables to merge against ``left`` (defined above).
right = pd.DataFrame({
    'key': ['K9', 'K23', 'K5', 'K7'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
right2 = pd.DataFrame({
    'key': ['K0', 'K6', 'K7', 'K3'],
    'C': ['C9', 'C1', 'C2', 'C8'],
    'D': ['D9', 'D1', 'D2', 'D8'],
})

# First merge: no prior state, fresh factorizers and sorter are created.
result, objectrizer, intrizer, leftsorter, leftcount = pd.pipeline_merge(
    left,
    right,
    how='pipeline',
)

print(result)

# Second merge: pass the state returned above back in, so the left table
# is not re-factorized from scratch.
result, objectrizer, intrizer, leftsorter, leftcount = pd.pipeline_merge(
    left,
    right2,
    factorizer=objectrizer,
    intfactorizer=intrizer,
    leftsorter=leftsorter,
    leftcount=leftcount,
    how='pipeline',
)

print(result)
Example #5
0
except ImportError:
    from pandas import ordered_merge as merge_ordered

# Benchmark fixture: a 500k-row left table and an 800k-row right table
# drawn from a pool of N random string keys.
N = 10000000
pieces = 10
indices = tm.makeStringIndex(N).values
indices2 = tm.makeStringIndex(N).values
key = np.tile(indices[:500000], 1)
key2 = np.tile(indices2[:500000], 1)
left = DataFrame({"key": key, "value": np.random.randn(500000)})
# BUGFIX: removed a dead ``right = {}`` assignment that was immediately
# overwritten by the DataFrame constructed below.
right = DataFrame({
    # 800,000 keys offset into the pool so only some of them overlap ``left``.
    "key": indices[1 * 100000 + 50000:9 * 100000 + 50000],
    "value2": np.random.randn(800000),
})
result = pipeline_merge(left, right, how="pipeline")

#    def time_merge_dataframe_integer_2key(self, sort):
#        pipeline_merge(self.df, self.df3, how="pipeline")
#
#    def time_merge_dataframe_integer_key(self, sort):
#        pipeline_merge(self.df, self.df2, on="key1", how="pipeline")

# class I8Merge:
#
#     params = ["inner", "outer", "left", "right"]
#     param_names = ["how"]
#
#     def setup(self, how):
#         low, high, n = -1000, 1000, 10 ** 6
#         self.left = DataFrame(
Example #6
0
print("\n")

logger.info("Start Running test for pipelined pandas code")

# Incremental-merge state, threaded through successive pipeline_merge calls.
leftsorter = None
leftcount = None
orizer = None
intrizer = None
count = 0
for piece in range(1, pieces):
    t0 = timeit.default_timer()
    result, orizer, intrizer, leftsorter, leftcount = pipeline_merge(
        left,
        right[piece],
        factorizer=orizer,
        intfactorizer=intrizer,
        leftsorter=leftsorter,
        leftcount=leftcount,
        slices=ttt - 1,
        how="pipeline")
    t1 = timeit.default_timer()
    count += (t1 - t0)
    # Lazy %-style args; the rendered message is identical to str() concat.
    logger.info("%s chunks take time %s Accum time: %s", piece, t1 - t0, count)

print(
    "--------------------------------------------------------------------------------"
)
print("\n")
Example #7
0
File: 1.py  Project: charles-typ/pandas
# Two right-hand tables; their concatenation is merged against the
# concatenated left tables below.
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3', 'K5', 'K7'],
    'C': ['C0', 'C1', 'C2', 'C3', 'C4', 'C5'],
    'D': ['D0', 'D1', 'D2', 'D3', 'D4', 'D5'],
})
right_2 = pd.DataFrame({
    'key': ['K5', 'K4', 'K3', 'K5', 'K6', 'K9'],
    'C': ['CC0', 'CC1', 'CC2', 'CC3', 'CC4', 'CC5'],
    'D': ['DD0', 'DD1', 'DD2', 'DD3', 'DD4', 'DD5'],
})

left_full = pd.concat([left, left_2])
right_full = pd.concat([right, right_2])

# One-shot pipeline merge over the fully concatenated tables.
result, orizer, intrizer, sorter, count = pd.pipeline_merge(
    left_full,
    right_full,
    slices=1,
    how="pipeline",
)
print(result)

# Build the left hash table incrementally: first from ``left``, then extend
# it with the keys of ``left_2`` by passing the previous state back in.
keys, fac, intfac = pd.build_hash_table(left['key'])
keys, fac, intfac = pd.build_hash_table(left_2['key'],
                                        factorizer=fac,
                                        intfactorizer=intfac,
                                        previous_keys=keys)
print(left_full)
print(keys)
result, fac, intfac, leftsorter, leftcount = pd.pipeline_merge(
    left_full,
    right,
    factorizer=fac,