def identify_datasets_with_multi_collections(graph_file_path):
    """ function identify datasets with multiple collections """

    if isinstance(graph_file_path, (str, pathlib.Path)):
        # load the graph from the filepath provided
        GraphWrapper.load_graph(
            file_dir_path=pathlib.Path(graph_file_path).parent,
            file_stem_name=pathlib.Path(graph_file_path).stem)
        # get the loaded graph object
        graph = GraphWrapper.get_graph()

        with graph.graph_lock:
            # select dataset vertices that belong to multiple collections
            dataset_ver_seq = graph.vs.select(is_dataset_eq=True, name_ne='base_vertex').\
                select(lambda vertex: 'in_collection' in vertex.attribute_names()
                       and vertex['in_collection'] is not None
                       and len(vertex['in_collection']) > 1)

            # get the name of the office this graph belongs to
            office_name = pathlib.Path(graph_file_path).stem.split('.')[0]
            dataset_ver_seq['office_name'] = office_name
            # inform the user that there are datasets with multiple collections
            print(
                f'There are {len(dataset_ver_seq)} datasets with links to multiple Collections within the {office_name.upper()} office'
            )

            return dataset_ver_seq
    else:
        raise TypeError("Invalid 'graph_file_path' specified")
Example no. 2
def transform(name=None, input_file=None):
    """
    function is responsible for transforming raw datasets into Collections
    """

    if not name: # user has not provided a scraper name to get collections with
        logger.error('Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)
    try:
        # load the Graph representing the deduplicated scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=f'{name}.deduplicate')
    except Exception:
        # fall back to the Graph representing the raw scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=name)

    # get the loaded graph
    graph = GraphWrapper.get_graph()

    # identify collections within the graph
    identify_collections_within_graph(graph)
    # link dataset vertices to their appropriate collection(s) within the graph
    link_datasets_to_collections_in_graph(graph)
    # write the identified collections to the raw dataset files
    add_collections_to_raw_datasets(graph=graph,
                                    output_dir=OUTPUT_DIR)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.collections')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                              "graphs", f"{name}"),
                                          file_stem_name=f'{name}.collections')

    
    # create the collections.json file                                      
    collections_list = [] # holds the list of collections acquired from graph

    with graph.graph_lock:
        for collection in graph.vs.select(is_collection_eq=True, name_ne='base_vertex'):
            collections_list.append({'collection_id': collection['collection_id'],
                                     'collection_title': collection['title'],
                                      'collection_url': collection['name']})
    
    # get a list of non-duplicate collections
    collections_list = get_distinct_collections_from(collections_list,
                                                     min_occurence_counter=1)
    # get the path where the identified Collections will be saved on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.collections.json'
    # write the collections identified from the 'name' scraper output to file
    h.write_file(file_output_path, collections_list)
    # upload the collections file to the S3 bucket (if configured)
    h.upload_to_s3_if_configured(file_output_path,
                                 f'{(name or "all")}.collections.json')
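
The get_distinct_collections_from helper used above is part of the project and its implementation is not shown here. A rough, self-contained sketch of that kind of filtering (an assumption about its behavior, not the project's actual code) could look like this:

from collections import Counter

def distinct_collections_sketch(collections_list, min_occurence_counter=1):
    # Illustrative only: keep one entry per collection_id, and only keep
    # collection_ids that occur at least 'min_occurence_counter' times.
    counts = Counter(coll['collection_id'] for coll in collections_list)
    seen, distinct = set(), []
    for coll in collections_list:
        cid = coll['collection_id']
        if counts[cid] >= min_occurence_counter and cid not in seen:
            seen.add(cid)
            distinct.append(coll)
    return distinct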
Example no. 3
    def close_spider(self, spider):
        print("SPIDER CLOSED")

        # write the graph to files
        # this method is explicitly thread/process safe, so no need for a lock
        GraphWrapper.write_graph(file_dir_path=Path(
            os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}"),
                                 file_stem_name=spider.name)

        # create the page legend file for this graph
        # this method is explicitly thread/process safe, so no need for a lock
        GraphWrapper.create_graph_page_legend(file_dir_path=Path(
            os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}"),
                                              file_stem_name=spider.name)
Example no. 4
def transform(name=None, input_file=None):
    """ Deduplicate the scraped datasets for the named scraper/office. """
    transformer = Transformer(name)

    # load the Graph representing the scraped datasets
    GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                            file_stem_name=name)
    # get the loaded graph
    graph = GraphWrapper.get_graph()

    if not input_file:
        out_file = os.path.join(OUTPUT_DIR, 'transformers', 'deduplicate',
                                f'deduplicated_{name or "all"}.lst')
    else:
        out_file = input_file

    with open(out_file, 'w') as fp:
        for fname in transformer.urls_dict.values():
            fp.write(fname + '\n')

    vertex_seq = find_duplicate_dataset_vertices(
        graph, list(transformer.urls_dict.values()))
    remove_vertices(graph, vertex_seq)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.deduplicate')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(
        os.getenv('ED_OUTPUT_PATH'), "graphs", f"{name}"),
                                          file_stem_name=f'{name}.deduplicate')

    logger.success('Deduplicated list is ready.')
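
This deduplicate transform writes the '{name}.deduplicate' graph that the collections transform in Example no. 2 tries to load first before falling back to the raw graph. A rough ordering sketch, assuming both transforms live in importable modules (the module names and office name below are assumptions, not the project's actual layout):

# Hypothetical orchestration sketch; module and office names are illustrative only.
from transformers import deduplicate, collections_transformer

office = 'oese'
deduplicate.transform(name=office)              # writes the '<office>.deduplicate' graph
collections_transformer.transform(name=office)  # loads it, or falls back to the raw graph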
Example no. 5
    def open_spider(self, spider):
        # create the folder for storing graph files
        Path(os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}").\
            mkdir(parents=True, exist_ok=True)
        print("SPIDER STARTED")
        # setup the graph object for this scraper
        # set the graph object as a class attribute
        if not hasattr(spider, 'scraper_graph'):
            spider.__class__.scraper_graph = GraphWrapper.get_graph()