def identify_datasets_with_multi_collections(graph_file_path):
    """ Identify dataset vertices that are linked to multiple Collections. """

    if isinstance(graph_file_path, (str, pathlib.Path)):
        # load the graph from the file path provided
        GraphWrapper.load_graph(
            file_dir_path=pathlib.Path(graph_file_path).parent,
            file_stem_name=pathlib.Path(graph_file_path).stem)
        # get the loaded graph object
        graph = GraphWrapper.get_graph()

        with graph.graph_lock:
            # select dataset vertices that are in multiple collections
            dataset_ver_seq = graph.vs.select(is_dataset_eq=True, name_ne='base_vertex').\
                select(lambda vertex: 'in_collection' in vertex.attribute_names() and
                       vertex['in_collection'] is not None and
                       len(vertex['in_collection']) > 1)
            # get the name of the office this graph belongs to
            office_name = pathlib.Path(graph_file_path).stem.split('.')[0]
            dataset_ver_seq['office_name'] = office_name

        # inform the user how many datasets link to multiple Collections
        print(
            f'There are {len(dataset_ver_seq)} datasets with links to multiple Collections within the {office_name.upper()} office'
        )
        return dataset_ver_seq
    else:
        raise TypeError("Invalid 'graph_file_path' specified")

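# Usage sketch (illustrative, not part of the original source): the helper above
# returns an igraph VertexSeq, so each dataset and the number of Collections it
# is linked to can be inspected directly. The wrapper function name below is a
# hypothetical example.
def print_multi_collection_datasets(graph_file_path):
    dataset_ver_seq = identify_datasets_with_multi_collections(graph_file_path)
    for dataset in dataset_ver_seq:
        # 'name' holds the dataset URL and 'in_collection' the linked Collections
        print(dataset['name'], len(dataset['in_collection']))
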
def transform(name=None, input_file=None):
    """ Transform the raw scraped datasets into Collections. """

    if not name:
        # user has not provided a scraper/office name to generate collections with
        logger.error('Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)

    try:
        # load the Graph representing the deduplicated scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=f'{name}.deduplicate')
    except Exception:
        # fall back to the Graph representing the scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=name)

    # get the loaded graph
    graph = GraphWrapper.get_graph()

    # identify collections within the graph
    identify_collections_within_graph(graph)
    # link dataset vertices to their appropriate collection(s) within the graph
    link_datasets_to_collections_in_graph(graph)

    # write the identified collections to the raw dataset files
    add_collections_to_raw_datasets(graph=graph, output_dir=OUTPUT_DIR)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.collections')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                             "graphs", f"{name}"),
                                          file_stem_name=f'{name}.collections')

    # create the collections.json file
    collections_list = []  # holds the list of collections acquired from the graph
    with graph.graph_lock:
        for collection in graph.vs.select(is_collection_eq=True, name_ne='base_vertex'):
            collections_list.append({'collection_id': collection['collection_id'],
                                     'collection_title': collection['title'],
                                     'collection_url': collection['name']})

    # get a list of non-duplicate collections
    collections_list = get_distinct_collections_from(collections_list,
                                                     min_occurence_counter=1)
    # get the path where the generated Collections will be saved on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.collections.json'
    # write the collections generated from the 'name' scraper output to file
    h.write_file(file_output_path, collections_list)
    # upload the collections file to the S3 bucket (if configured)
    h.upload_to_s3_if_configured(file_output_path,
                                 f'{(name or "all")}.collections.json')

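# 'get_distinct_collections_from' is called above but not shown in this excerpt.
# A plausible sketch, assuming it deduplicates the collected dicts by
# 'collection_id' and keeps only collections that occur at least
# 'min_occurence_counter' times (the actual implementation may differ):
from collections import Counter

def get_distinct_collections_from(collections_list, min_occurence_counter=1):
    # count how often each collection_id appears in the raw list
    id_counter = Counter(c['collection_id'] for c in collections_list)
    distinct = {}
    for collection in collections_list:
        cid = collection['collection_id']
        # keep the first occurrence of every id that meets the occurrence threshold
        if id_counter[cid] >= min_occurence_counter and cid not in distinct:
            distinct[cid] = collection
    return list(distinct.values())
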
def close_spider(self, spider):
    print("SPIDER CLOSED")

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(
                                 os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}"),
                             file_stem_name=spider.name)
    # create the page legend file for this graph
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(
                                              os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}"),
                                          file_stem_name=spider.name)

def transform(name=None, input_file=None):
    transformer = Transformer(name)

    # load the Graph representing the scraped datasets
    GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                            file_stem_name=name)
    # get the loaded graph
    graph = GraphWrapper.get_graph()

    if not input_file:
        out_file = os.path.join(OUTPUT_DIR, 'transformers', 'deduplicate',
                                f'deduplicated_{name or "all"}.lst')
    else:
        out_file = input_file

    # write the deduplicated list of dataset file names to disk
    with open(out_file, 'w') as fp:
        for fname in transformer.urls_dict.values():
            fp.write(fname + '\n')

    # locate the duplicate dataset vertices and remove them from the graph
    vertex_seq = find_duplicate_dataset_vertices(
        graph, list(transformer.urls_dict.values()))
    remove_vertices(graph, vertex_seq)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.deduplicate')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(
                                              os.getenv('ED_OUTPUT_PATH'), "graphs", f"{name}"),
                                          file_stem_name=f'{name}.deduplicate')

    logger.success('Deduplicated list is ready.')

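# 'find_duplicate_dataset_vertices' and 'remove_vertices' are called above but
# not defined in this excerpt. A minimal sketch of 'remove_vertices' only,
# assuming it drops the selected vertices from the shared graph while holding
# the graph lock (python-igraph's Graph.delete_vertices accepts a VertexSeq);
# the real implementation may differ:
def remove_vertices(graph, vertex_seq):
    with graph.graph_lock:
        # delete the duplicate dataset vertices from the graph in one operation
        graph.delete_vertices(vertex_seq)
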
def open_spider(self, spider):
    # create the folder for storing graph files
    Path(os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}").\
        mkdir(parents=True, exist_ok=True)

    print("SPIDER STARTED")

    # set up the graph object for this scraper
    # and store it as a class attribute
    if not hasattr(spider, 'scraper_graph'):
        spider.__class__.scraper_graph = GraphWrapper.get_graph()

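# For context (not part of the original excerpt): open_spider/close_spider are
# the standard Scrapy item-pipeline hooks, so the class containing them is
# enabled via the ITEM_PIPELINES setting in the project's settings.py. The
# module path, class name, and priority below are hypothetical placeholders.
ITEM_PIPELINES = {
    'myproject.pipelines.GraphItemPipeline': 300,
}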