Example #1
def nothing_extract(event):

    """
    Function
    :param event (dict) -- contains auth header and list of HTTP links to extractable files:
    :return metadata (dict) -- metadata as gotten from the materials_io library:
    """

    import os
    import sys
    import time

    from xtract_sdk.packagers.family import Family
    from xtract_sdk.packagers.family_batch import FamilyBatch

    t0 = time.time()

    sys.path.insert(1, '/')

    # The family batch may arrive as a FamilyBatch object or as a plain dict.
    all_families = event['family_batch']

    if isinstance(all_families, dict):
        family_batch = FamilyBatch()
        for family in all_families["families"]:
            fam = Family()
            fam.from_dict(family)
            family_batch.add_family(fam)
        all_families = family_batch

    for family in all_families.families:
        for file_obj in family.files:
            # Repeatedly open each file; no extraction is performed.
            for _ in range(10):
                with open(file_obj['path'], 'r'):
                    pass

    t1 = time.time()

    return {"family_batch": all_families,
            "container_version": os.environ["container_version"],
            "transfer_time": 0,
            "import_time": 0,
            "family_fetch_time": 0,
            "file_unpack_time": 0,
            "full_extract_loop_time": 0,
            "total_time": t1 - t0
            }
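
A minimal sketch of invoking this extractor locally (assumptions: the container_version value and the /tmp/sample.txt path are placeholders, and the family dict mirrors the shape used in the examples below):

import os
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

os.environ["container_version"] = "0.0.1"  # placeholder version string

family = Family()
family.from_dict({
    'family_id': None,
    'files': [{'path': '/tmp/sample.txt'}],  # placeholder path
    'metadata': {},
    'headers': None,
    'download_type': None,
    'groups': []
})

batch = FamilyBatch()
batch.add_family(family)

result = nothing_extract({'family_batch': batch})
print(f"Total time: {result['total_time']}")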
Example #2
    def orch_thread(self, headers):
        to_terminate = False

        print(f"ENDPOINTS TO CHECK: {self.fx_eps_to_check}")
        all_extractors = get_all_extractors(self.fx_eps_to_check)
        print(f"Fetched all extractors... {all_extractors}")

        fxc = get_fx_client(headers)

        self.cur_status = "EXTRACTING"

        while True:

            # If our accounting is complete, terminate.
            # NOTE: when concurrent, will also need to check if scheduling is DONE.
            num_accounted = (self.counters['fx']['success']
                             + self.counters['fx']['failed']
                             + self.counters['flagged_unknown'])
            if num_accounted == self.counters['cumu_scheduled'] and self.cur_status == 'SCHEDULED':
                to_terminate = True

            if to_terminate:
                print("[ORCH] Terminating!")
                print(f"Final counters: {self.counters}")
                self.cur_status = 'COMPLETED'  # TODO: Need to push this status to DB.
                break

            print(f"[ORCH] WQ length: {self.to_xtract_q.qsize()}")

            if self.to_xtract_q.empty() and self.funcx_current_tasks.empty():
                print("[ORCH] Work queues empty. Sleeping...")
                time.sleep(5)

            else:
                batch = fxc.create_batch()
                batch_len = 0
                while not self.to_xtract_q.empty():  # TODO: also need max batch size here.
                    family = self.to_xtract_q.get()
                    self.counters['cumu_orch_enter'] += 1

                    extractor_id = family['first_extractor']

                    if extractor_id in extractor_map:
                        extractor = extractor_map[extractor_id]
                    else:
                        self.counters['flagged_unknown'] += 1
                        continue

                    # We should not need to repack the family and add an empty base_url here.
                    fam_batch = FamilyBatch()
                    packed_family = Family()
                    family['base_url'] = None
                    packed_family.from_dict(family)

                    fam_batch.add_family(packed_family)

                    # TODO: hardcodes galore.
                    event = extractor.create_event(
                        family_batch=fam_batch,
                        ep_name='default',
                        xtract_dir="/home/tskluzac/.xtract",
                        sys_path_add="/",
                        module_path=f"xtract_{extractor_id}_main",
                        metadata_write_path='/home/tskluzac/mdata')

                    fx_ep_id = self.fx_eps_to_check[0]  # TODO: should not be fixed to the first fx_ep.

                    print(f"Endpoint ID: {fx_ep_id}")
                    batch.add(event,
                              endpoint_id=fx_ep_id,
                              function_id=all_extractors[f"xtract-{extractor_id}"][fx_ep_id])
                    batch_len += 1

                # Only want to send tasks if we retrieved tasks.
                if batch_len > 0:
                    batch_res = fxc.batch_run(batch)
                    time.sleep(1.1)
                    for item in batch_res:
                        self.funcx_current_tasks.put(item)

                poll_batch = []

                # print("Entering task loop")
                for _ in range(20):  # TODO: hardcoded poll-batch size.
                    if not self.funcx_current_tasks.empty():
                        tid = self.funcx_current_tasks.get()
                        poll_batch.append(tid)
                # print(f"Current length of poll_batch: {len(poll_batch)}")

                if len(poll_batch) > 0:
                    x = fxc.get_batch_result(poll_batch)
                    time.sleep(1.1)
                    # print(f"Poll result: {x}")
                    for item in x:
                        result = x[item]

                        if result['status'] == 'success':
                            self.counters['fx']['success'] += 1

                        elif result['status'] == 'failed':
                            # Count the failure (with the same key used in the termination
                            # check above) before reraising, which would otherwise make
                            # this increment unreachable.
                            self.counters['fx']['failed'] += 1
                            result['exception'].reraise()

                        elif result['pending']:
                            self.funcx_current_tasks.put(item)
                        else:
                            # If we get this far, the status is unrecognized and needs developer attention.
                            raise ValueError("[ORCH] CRITICAL: Unrecognized funcX status...")
                    print(self.counters)
Example #3
    def preproc_fam_batches(self):

        fam_count = 0

        # Just create an empty one out here so Python doesn't yell at me.
        fam_batch = FamilyBatch()

        num_overloads = 0
        # while we have files and haven't exceeded the weak scaling threshold (file_cutoff)
        while not self.family_queue.empty() and fam_count < file_cutoff:

            fam_batch = FamilyBatch()
            total_fam_batch_size = 0

            # Keep filling the batch until it reaches map_size, the queue
            # empties, or we hit the file cutoff.
            while (len(fam_batch.families) < map_size
                   and not self.family_queue.empty()
                   and fam_count < file_cutoff):

                fam_count += 1
                fam = self.family_queue.get()

                total_family_size = 0
                # First convert to the correct paths
                for file_obj in fam['files']:
                    old_path = file_obj['path']
                    new_path = self.path_converter(fam['family_id'], old_path)
                    file_obj['path'] = new_path
                    file_size = file_obj['metadata']['physical']['size']
                    total_family_size += file_size

                for group in fam['groups']:
                    for file_obj in group['files']:
                        old_path = file_obj['path']
                        new_path = self.path_converter(fam['family_id'],
                                                       old_path)
                        file_obj['path'] = new_path

                empty_fam = Family()
                empty_fam.from_dict(fam)

                # We will ONLY handle the SIZE issue in here.

                if soft_batch_bytes_max > 0:
                    # So if this last family would put us over the top,
                    if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                        num_overloads += 1
                        print(f"Num overloads {num_overloads}")
                        # then we append the old batch (if not empty),
                        if len(fam_batch.families) > 0:
                            self.fam_batches.append(fam_batch)

                        # and start a fresh, empty one.
                        fam_batch = FamilyBatch()
                        total_fam_batch_size = 0

                        assert len(fam_batch.families) == 0

                # Track the running byte size of the current batch.
                total_fam_batch_size += total_family_size

                # And then continue (here we either add to our prior fam_batch OR the new one).
                fam_batch.add_family(empty_fam)

            assert len(fam_batch.families) <= map_size

            self.fam_batches.append(fam_batch)

        # img_extractor = NothingExtractor()
        img_extractor = MatioExtractor()

        # TODO: ADDING TEST. Making sure we have all of our files here.

        ta = time.time()
        num_families = 0
        for item in self.fam_batches:
            num_families += len(item.families)

        print(f"Total number of families: {num_families}")
        tb = time.time()
        print(f"Time to count families: {tb - ta}")
        time.sleep(5)

        # This check makes sure our batches are the correct size, to avoid the
        # January 2021 disaster of having vastly incorrect numbers of batches.
        #
        # Here we check that the number of families we are processing is no
        # greater than the total number of batches times the batch size (i.e.,
        # the last batch may be only partially full), and no smaller than the
        # case where our last batch is missing entirely.
        #
        # This leaves a very small window for error. Could use modulus to be
        # more exact.

        # TODO: Bring this back (but use for grouping by num. files)

        # try:
        #     assert len(self.fam_batches) * (map_size-1) <= fam_count <= len(self.fam_batches) * map_size
        # except AssertionError as e:
        #     print(f"Caught {e} after creating client batches...")
        #     print(f"Number of batches: {len(self.fam_batches)}")
        #     print(f"Family Count: {fam_count}")
        #
        #     print("Cannot continue. Exiting...")
        #     exit()
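
        # A hedged sketch of the exact "modulus" check suggested above: the
        # inner loop fills every batch to map_size except possibly the last,
        # so ceiling division recovers the expected batch count. Assumes
        # map_size > 0 and no byte-based splitting (soft_batch_bytes_max <= 0,
        # since that branch appends extra batches). Warns rather than exiting.
        expected_batches = math.ceil(fam_count / map_size)
        if soft_batch_bytes_max <= 0 and len(self.fam_batches) != expected_batches:
            print(f"[WARN] Expected {expected_batches} family batches, "
                  f"got {len(self.fam_batches)}")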

        print(f"Container type: {container_type}")
        print(f"Location: {location}")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        # funcX batching. Here we take the 'user' FamilyBatch objects and put them into a batch we send to funcX.
        num_fx_batches = 0
        current_batch = []

        print(f"Number of family batches: {len(self.fam_batches)}")
        for fam_batch in self.fam_batches:

            # print(len(current_batch))
            # print(batch_size)

            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                # print("Marking batch!")
                # print(len(current_batch))
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]
                num_fx_batches += 1

        # Grab the stragglers.
        if len(current_batch) > 0:
            print("Marking batch!")
            self.funcx_batches.put(current_batch)
            num_fx_batches += 1

        # See same description as above (map example) for explanation.
        try:
            theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)

            # print(f"Theoretical full batches: {}")
            assert theor_full_batches == num_fx_batches
        except AssertionError as e:
            print(f"Caught {e} after creating funcX batches...")
            print(f"Number of batches: {self.funcx_batches.qsize()}")
            print(f"Family Count: {num_fx_batches}")

            print("Cannot continue. Exiting...")
            exit()
Example #4
    def preproc_fam_batches(self):

        total_tasks = 0

        print("PREPROCESSING!")
        while not self.image_path_list.empty():

            fam_batch = FamilyBatch()
            # print(len(fam_batch.families))
            while len(fam_batch.families) < map_size:

                if self.image_path_list.empty():
                    break

                path = self.image_path_list.get()
                print(path)
                family = dict()

                family['family_id'] = None

                # TODO: CHANGE THIS FOR THETA.
                if system == 'midway2':
                    family['files'] = [{'path': f'/project2/chard/skluzacek/train2014/{path}'}]
                elif system == 'theta':
                    family['files'] = [{'path': f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'}]
                family['metadata'] = dict()
                family['headers'] = None
                family['download_type'] = None
                family['groups'] = []

                empty_fam = Family()
                empty_fam.from_dict(family)
                print("ADDING FAMILY TO FAM BATCH")
                fam_batch.add_family(empty_fam)

            self.fam_batches.append(fam_batch)

        img_extractor = ImageExtractor()

        print(f"REGISTERING FUNCTION")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        current_batch = []
        for fam_batch in self.fam_batches:
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                print(f"Length of current batch: {len(current_batch)}")
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]

        # Grab the stragglers.
        if len(current_batch) > 0:
            self.funcx_batches.put(current_batch)

        print("Let me see")

        batch_counter = 0
Example #5
}],
                           parser="potato")

assert isinstance(group_id, str), "fam.add_group is not returning an id of type str"
print(type(fam.files))

assert sorted([item["path"] for item in fam.files]) == ['a', 'b', 'c', 'd', 'e'], \
    "fam.files not properly inheriting group.files"
assert sorted([item["path"] for item in fam2.files]) == ['v', 'w', 'x', 'y', 'z'], \
    "fam.files not properly inheriting group.files"

# Here we test if going to_dict and from_dict leads us to our original family object.
dict_fam = fam.to_dict()
back_to_reg_fam = Family(download_type="gdrive")
back_to_reg_fam.from_dict(dict_fam)

assert fam.family_id == back_to_reg_fam.family_id, "to_dict -> from_dict family_ids do not match"
assert fam.download_type == back_to_reg_fam.download_type, "to_dict -> from_dict download_types do not match"

print(fam.files)
print(back_to_reg_fam.files)
assert fam.files == back_to_reg_fam.files

for group in back_to_reg_fam.groups:
    assert group in fam.groups, "to_dict -> from_dict group_ids do not match"
    assert fam.groups[group].metadata == back_to_reg_fam.groups[group].metadata
    assert fam.groups[group].parser == back_to_reg_fam.groups[group].parser
    assert fam.groups[group].files == back_to_reg_fam.groups[group].files

print("Passed all family packaging tests!")