Example #1
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch


def create_many_family_mock_event(files, parser=None):
    # TODO: this will break for matio
    mock_event = dict()

    fam_batch = FamilyBatch()

    for file in files:

        # Reset per entry so a plain-string file does not inherit the
        # family_id of an earlier dict entry.
        family_id = None
        if isinstance(file, dict):
            family_id = str(file['family_id'])
            file = file['filename']

        test_fam_1 = Family()
        group_file_objs = []

        base_path = file
        group_file_objs.append({'path': base_path, 'metadata': dict()})
        test_fam_1.download_type = "LOCAL"

        test_fam_1.add_group(files=group_file_objs, parser=parser)
        if family_id is not None:
            test_fam_1.family_id = family_id
        fam_batch.add_family(test_fam_1)

    mock_event['family_batch'] = fam_batch
    return mock_event
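
A minimal usage sketch for the helper above; the paths and family_id are hypothetical stand-ins:

# Hypothetical usage: dict entries carry an explicit family_id,
# plain-string entries do not.
event = create_many_family_mock_event(
    files=[{'family_id': 1, 'filename': '/tmp/sample_a.csv'},
           '/tmp/sample_b.csv'],
    parser=None)
print(len(event['family_batch'].families))  # -> 2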
Example #2
def nothing_extract(event):
    """
    No-op extractor used for timing benchmarks: it opens each file but
    extracts nothing.

    :param event: (dict) contains auth headers and a family_batch of
        extractable files
    :return: (dict) the unmodified family_batch plus timing statistics
    """

    import os
    import sys
    import time

    from xtract_sdk.packagers.family import Family
    from xtract_sdk.packagers.family_batch import FamilyBatch

    t0 = time.time()

    sys.path.insert(1, '/')

    # The family_batch may arrive as a FamilyBatch object or as its dict form.
    all_families = event['family_batch']

    if isinstance(all_families, dict):
        family_batch = FamilyBatch()
        for family in all_families["families"]:
            fam = Family()
            fam.from_dict(family)
            family_batch.add_family(fam)
        all_families = family_batch

    for family in all_families.families:
        family_id = family.family_id
        fam_files = family.files
        headers = family.headers

        for file_obj in fam_files:
            # Touch each file ten times to simulate I/O without extracting.
            for _ in range(10):
                with open(file_obj['path'], 'r'):
                    pass

    t1 = time.time()

    return {"family_batch": all_families,
            "container_version": os.environ["container_version"],
            "transfer_time": 0,
            "import_time": 0,
            "family_fetch_time": 0,
            "file_unpack_time": 0,
            "full_extract_loop_time": 0,
            "total_time": t1 - t0
            }
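
For context, a minimal driver sketch for nothing_extract; the file path is a hypothetical stand-in, and container_version must be set because the return dict reads it from the environment:

import os

from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch

os.environ['container_version'] = 'dev'  # read by nothing_extract's return dict

fam = Family()
fam.download_type = "LOCAL"
fam.add_group(files=[{'path': '/tmp/sample.txt', 'metadata': dict()}],
              parser=None)

batch = FamilyBatch()
batch.add_family(fam)

stats = nothing_extract({'family_batch': batch})
print(stats['total_time'])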
Example #3
from xtract_sdk.packagers.family import Family
from xtract_sdk.packagers.family_batch import FamilyBatch


def create_mock_event(files, parser=None):
    mock_event = dict()

    fam_batch = FamilyBatch()

    test_fam_1 = Family()
    group_file_objs = []

    # download_type is constant for the family, so set it once outside the loop.
    test_fam_1.download_type = "LOCAL"

    for file in files:
        group_file_objs.append({'path': file, 'metadata': dict()})

    test_fam_1.add_group(files=group_file_objs, parser=parser)
    fam_batch.add_family(test_fam_1)

    mock_event['family_batch'] = fam_batch
    return mock_event
Example #4
    # Execute the extractor on our family_batch.
    xtra.execute_extractions(family_batch=event['family_batch'],
                             input_type=str)

    # All metadata are held in XtractAgent's memory. Flush to disk!
    xtra.flush_metadata_to_files(writer='json')

    return xtra.get_completion_stats()


mock_event = dict()

test_fam_1 = Family()
test_fam_2 = Family()

base_path = "/Users/tylerskluzacek/xtract-sdk/tests/xtract-tabular/tests/test_files"
test_fam_1.add_group(files=[{
    'path': os.path.join(base_path, 'comma_delim'),
    'metadata': dict()
}],
                     parser=None)
test_fam_1.download_type = "LOCAL"
print(test_fam_1.to_dict())

fam_batch = FamilyBatch()
fam_batch.add_family(test_fam_1)
mock_event['family_batch'] = fam_batch

data = extract_tabular(mock_event)
print(data)
Example #5
    def orch_thread(self, headers):
        to_terminate = False

        print(f"ENDPOINTS TO CHECK: {self.fx_eps_to_check}")
        all_extractors = get_all_extractors(self.fx_eps_to_check)
        print(f"Fetched all extractors... {all_extractors}")

        fxc = get_fx_client(headers)

        self.cur_status = "EXTRACTING"

        while True:

            # If our accounting is complete
            # NOTE: when concurrent, will also need to check if scheduling is DONE.
            if self.counters['fx']['success'] + \
                    self.counters['fx']['failed'] + \
                    self.counters['flagged_unknown'] == self.counters['cumu_scheduled'] \
                    and self.cur_status == 'SCHEDULED':
                to_terminate = True

            if to_terminate:
                print("[ORCH] Terminating!")
                print(f"Final counters: {self.counters}")
                self.cur_status = 'COMPLETED'  # TODO: Need to push this status to DB.
                break

            print(f"[ORCH] WQ length: {self.to_xtract_q.qsize()}")

            if self.to_xtract_q.empty() and self.funcx_current_tasks.empty():
                print(f"[ORCH] Empty work thread. Sleeping...")
                time.sleep(5)

            else:
                batch = fxc.create_batch()
                batch_len = 0
                while not self.to_xtract_q.empty():  # TODO: also need max batch size here.
                    family = self.to_xtract_q.get()
                    self.counters['cumu_orch_enter'] += 1

                    extractor_id = family['first_extractor']

                    if extractor_id in extractor_map:
                        extractor = extractor_map[extractor_id]
                    else:
                        self.counters['flagged_unknown'] += 1
                        continue

                    # We should not need to repack and add an empty base_url
                    fam_batch = FamilyBatch()
                    packed_family = Family()
                    family['base_url'] = None
                    packed_family.from_dict(family)

                    fam_batch.add_family(packed_family)

                    # TODO: hardcodes galore.
                    event = extractor.create_event(
                        family_batch=fam_batch,
                        ep_name='default',
                        xtract_dir="/home/tskluzac/.xtract",
                        sys_path_add="/",
                        module_path=f"xtract_{extractor_id}_main",
                        metadata_write_path='/home/tskluzac/mdata')

                    # TODO: Should not be fixed to first fx_ep.
                    fx_ep_id = self.fx_eps_to_check[0]

                    print(f"Endpoint ID: {fx_ep_id}")
                    batch.add(
                        event,
                        endpoint_id=fx_ep_id,
                        function_id=all_extractors[f"xtract-{extractor_id}"][fx_ep_id])
                    batch_len += 1

                # Only want to send tasks if we retrieved tasks.
                if batch_len > 0:
                    batch_res = fxc.batch_run(batch)
                    time.sleep(1.1)
                    for item in batch_res:
                        self.funcx_current_tasks.put(item)

                poll_batch = []

                # print("Entering task loop")
                for i in range(0, 20):  # TODO: hardcode
                    if not self.funcx_current_tasks.empty():
                        tid = self.funcx_current_tasks.get()
                        poll_batch.append(tid)
                # print(f"Current length of poll_batch: {len(poll_batch)}")

                if len(poll_batch) > 0:
                    x = fxc.get_batch_result(poll_batch)
                    time.sleep(1.1)
                    # print(f"Poll result: {x}")
                    for item in x:
                        result = x[item]

                        if result['status'] == 'success':
                            self.counters['fx']['success'] += 1

                        elif result['status'] == 'failed':
                            # Count the failure before re-raising; the key must
                            # match the 'failed' counter used in the exit check.
                            self.counters['fx']['failed'] += 1
                            result['exception'].reraise()

                        elif result['pending']:
                            self.funcx_current_tasks.put(item)
                        else:
                            # If we haven't figured it out until here, we need some dev...
                            raise ValueError(
                                "[ORCH] CRITICAL Unrecognized funcX status...")
                    print(self.counters)
Example #6
    def preproc_fam_batches(self):

        fam_count = 0

        # Just create an empty one out here so Python doesn't yell at me.
        fam_batch = FamilyBatch()

        num_overloads = 0
        # while we have files and haven't exceeded the weak scaling threshold (file_cutoff)
        while not self.family_queue.empty() and fam_count < file_cutoff:

            fam_batch = FamilyBatch()
            total_fam_batch_size = 0

            # Keep filling the batch until it reaches map_size, the queue
            # empties, or we hit the file cutoff.
            while (len(fam_batch.families) < map_size
                   and not self.family_queue.empty()
                   and fam_count < file_cutoff):

                fam_count += 1
                fam = self.family_queue.get()

                total_family_size = 0
                # First convert to the correct paths
                for file_obj in fam['files']:
                    old_path = file_obj['path']
                    new_path = self.path_converter(fam['family_id'], old_path)
                    file_obj['path'] = new_path
                    file_size = file_obj['metadata']['physical']['size']
                    total_family_size += file_size

                for group in fam['groups']:
                    for file_obj in group['files']:
                        old_path = file_obj['path']
                        new_path = self.path_converter(fam['family_id'],
                                                       old_path)
                        file_obj['path'] = new_path

                empty_fam = Family()
                empty_fam.from_dict(fam)

                # We will ONLY handle the SIZE issue in here.

                if soft_batch_bytes_max > 0:
                    # So if this last file would put us over the top,
                    if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                        num_overloads += 1
                        print(f"Num overloads {num_overloads}")
                        # then we append the old batch (if not empty),
                        if len(fam_batch.families) > 0:
                            self.fam_batches.append(fam_batch)

                        # empty the old one
                        fam_batch = FamilyBatch()
                        total_fam_batch_size = total_family_size

                        assert (len(fam_batch.families) == 0)

                # and then continue (here we either add to our prior fam_batch OR the new one).
                fam_batch.add_family(empty_fam)

            assert len(fam_batch.families) <= map_size

            self.fam_batches.append(fam_batch)

        # img_extractor = NothingExtractor()
        img_extractor = MatioExtractor()

        # TODO: ADDING TEST. Making sure we have all of our files here.

        ta = time.time()
        num_families = 0
        for item in self.fam_batches:
            num_families += len(item.families)

        print(num_families)
        tb = time.time()
        print(f"Time to move families: {tb-ta}")
        time.sleep(5)
        # exit()

        # This check makes sure our batches are the correct size, to avoid the
        #  January 2021 disaster of having vastly incorrect numbers of batches.
        #
        #  Here we check that the number of families we are processing is LESS
        #   than the total number of batches times the batch size (i.e., the
        #   last batch may be only partially full), and GREATER than the count
        #   we would see if the last map were missing entirely.
        #
        #  This leaves a very small window for error. Could use modulus to be
        #   more exact.

        # TODO: Bring this back (but use for grouping by num. files)

        # try:
        #     assert len(self.fam_batches) * (map_size-1) <= fam_count <= len(self.fam_batches) * map_size
        # except AssertionError as e:
        #     print(f"Caught {e} after creating client batches...")
        #     print(f"Number of batches: {len(self.fam_batches)}")
        #     print(f"Family Count: {fam_count}")
        #
        #     print("Cannot continue. Exiting...")
        #     exit()
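
        # For concreteness (hypothetical numbers): with map_size = 16 and
        # 5 client batches, the check above accepts any family count in
        # [5 * 15, 5 * 16] = [75, 80], so a small off-by-one in batching could
        # still slip through; a modulus-based check would be exact.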

        print(f"Container type: {container_type}")
        print(f"Location: {location}")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        # funcX batching. Here we take the 'user' FamilyBatch objects and put them into a batch we send to funcX.
        num_fx_batches = 0
        current_batch = []

        print(f"Number of family batches: {len(self.fam_batches)}")
        for fam_batch in self.fam_batches:

            # print(len(current_batch))
            # print(batch_size)

            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                # print("Marking batch!")
                # print(len(current_batch))
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]
                num_fx_batches += 1

        # Grab the stragglers.
        if len(current_batch) > 0:
            print("Marking batch!")
            self.funcx_batches.put(current_batch)
            num_fx_batches += 1

        # See same description as above (map example) for explanation.
        try:
            theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)

            # print(f"Theoretical full batches: {}")
            assert theor_full_batches == num_fx_batches
        except AssertionError as e:
            print(f"Caught {e} after creating funcX batches...")
            print(f"Number of batches: {self.funcx_batches.qsize()}")
            print(f"Family Count: {num_fx_batches}")

            print("Cannot continue. Exiting...")
            exit()
Example #7
family_1.add_group(files=[{
    'path': file_id,
    'is_gdoc': True,
    'mimeType': "text/csv"
}],
                   parser='xtract-tabular')
family_1.base_url = ""

family_2.add_group(files=[{
    'path': file_id2,
    'is_gdoc': False
}],
                   parser='xtract-tabular')
family_2.download_type = "GDRIVE"

fam_batch = FamilyBatch()
fam_batch.add_family(family_1)
fam_batch.add_family(family_2)


def test(event):
    import os
    return os.environ['container_version']


def main(fxc, ep_id):
    container_uuid = fxc.register_container('xtract-tabular.img',
                                            'singularity')
    print("Container UUID: {}".format(container_uuid))
    fn_uuid = fxc.register_function(
        test,
Example #8
    def preproc_fam_batches(self):

        total_tasks = 0

        print("PREPROCESSING!")
        while not self.image_path_list.empty():

            fam_batch = FamilyBatch()
            # print(len(fam_batch.families))
            while len(fam_batch.families) < map_size:

                if self.image_path_list.empty():
                    break

                path = self.image_path_list.get()
                print(path)
                family = dict()

                family['family_id'] = None

                # TODO: CHANGE THIS FOR THETA.
                if system == 'midway2':
                    family['files'] = [{'path': f'/project2/chard/skluzacek/train2014/{path}'}]
                elif system == 'theta':
                    family['files'] = [{'path': f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'}]
                family['metadata'] = dict()
                family['headers'] = None
                family['download_type'] = None
                family['groups'] = []

                empty_fam = Family()
                empty_fam.from_dict(family)
                print("ADDING FAMILY TO FAM BATCH")
                fam_batch.add_family(empty_fam)

            self.fam_batches.append(fam_batch)

        img_extractor = ImageExtractor()

        print(f"REGISTERING FUNCTION")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        current_batch = []
        for fam_batch in self.fam_batches:
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                print(f"Length of current batch: {len(current_batch)}")
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]

        # Grab the stragglers.
        if len(current_batch) > 0:
            self.funcx_batches.put(current_batch)

        print("Let me see")

        batch_counter = 0
Example #9
assert fam.download_type == back_to_reg_fam.download_type, "to_dict -> from_dict download_types do not match"

print(fam.files)
print(back_to_reg_fam.files)
assert fam.files == back_to_reg_fam.files

for group in back_to_reg_fam.groups:
    assert group in fam.groups, "to_dict -> from_dict group_ids do not match"
    assert fam.groups[group].metadata == back_to_reg_fam.groups[group].metadata
    assert fam.groups[group].parser == back_to_reg_fam.groups[group].parser
    assert fam.groups[group].files == back_to_reg_fam.groups[group].files

print("Passed all family packaging tests!")
time.sleep(1)

family_batch = FamilyBatch()

family_batch.add_family(back_to_reg_fam)
family_batch.add_family(fam2)

print(family_batch.families)
print(family_batch.file_ls)

desc_batch_files = sorted([item["path"] for item in family_batch.file_ls])
assert desc_batch_files == ['a', 'b', 'c', 'd', 'e', 'v', 'w', 'x', 'y', 'z'], \
    "family_batch not correctly getting files from families"

dict_batch = family_batch.to_dict()

print(dict_batch)
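
As a closing sketch, the dict form can be rebuilt into a batch with the same Family.from_dict pattern used in Example #2 (assuming to_dict emits the same 'families' key that Example #2 consumes):

# Round-trip sketch: rebuild a FamilyBatch from its dict form.
rebuilt = FamilyBatch()
for fam_dict in dict_batch['families']:
    f = Family()
    f.from_dict(fam_dict)
    rebuilt.add_family(f)
assert len(rebuilt.families) == len(family_batch.families)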