Example #1
def transform(name=None, input_file=None):
    transformer = Transformer(name)

    # load the Graph representing the scraped datasets
    GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                            file_stem_name=name)
    # get the loaded graph
    graph = GraphWrapper.get_graph()

    if not input_file:
        out_file = os.path.join(OUTPUT_DIR, 'transformers', 'deduplicate',
                                f'deduplicated_{name or "all"}.lst')
    else:
        out_file = input_file

    # write the file names recorded in the transformer's urls_dict to the output list
    with open(out_file, 'w') as fp:
        for fname in transformer.urls_dict.values():
            fp.write(fname + '\n')

    vertex_seq = find_duplicate_dataset_vertices(
        graph, list(transformer.urls_dict.values()))
    remove_vertices(graph, vertex_seq)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.deduplicate')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(
        os.getenv('ED_OUTPUT_PATH'), "graphs", f"{name}"),
                                          file_stem_name=f'{name}.deduplicate')

    logger.success('Deduplicated list is ready.')
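
The example above calls two helpers, find_duplicate_dataset_vertices and remove_vertices, whose bodies are not shown. As a rough, hypothetical sketch of that pattern with python-igraph (which GraphWrapper appears to wrap), the helpers might select vertices by their URL attribute and delete them; the selection rule below is an assumption for illustration, not the project's actual logic.

# Hypothetical sketch only -- illustrates selecting and deleting vertices with
# python-igraph; the real helpers may use different criteria.
import igraph

def find_duplicate_dataset_vertices(graph, kept_urls):
    # select vertices whose 'name' attribute (a URL) is not in the kept list
    kept = set(kept_urls)
    return graph.vs.select(lambda v: v['name'] not in kept)

def remove_vertices(graph, vertex_seq):
    # delete the selected vertices (and their incident edges) from the graph
    graph.delete_vertices(vertex_seq)

if __name__ == '__main__':
    g = igraph.Graph()
    g.add_vertices(3)
    g.vs['name'] = ['http://a', 'http://b', 'http://b-duplicate']
    remove_vertices(g, find_duplicate_dataset_vertices(g, ['http://a', 'http://b']))
    print(g.vs['name'])  # ['http://a', 'http://b']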
Example #2
def transform(name=None, input_file=None):
    """
    This function is responsible for transforming raw datasets into Collections.
    """

    if not name: # user has not provided a scraper name to get collections with
        logger.error('Scraper/Office name not provided. Cannot generate collections')
        sys.exit(1)
    try:
        # load the Graph representing the deduplicated scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=f'{name}.deduplicate')
    except Exception:
        # fall back to the Graph representing the raw scraped datasets
        GraphWrapper.load_graph(file_dir_path=Path(OUTPUT_DIR, 'graphs', name),
                                file_stem_name=name)

    # get the loaded graph
    graph = GraphWrapper.get_graph()

    # identify collections within the graph
    identify_collections_within_graph(graph)
    # link dataset vertices to their appropriate collection(s) within the graph
    link_datasets_to_collections_in_graph(graph)
    # write the identified collections to the raw dataset files
    add_collections_to_raw_datasets(graph=graph,
                                    output_dir=OUTPUT_DIR)

    # write the graph to files
    # this method is explicitly thread/process safe, so no need for a lock
    GraphWrapper.write_graph(file_dir_path=Path(os.getenv('ED_OUTPUT_PATH'),
                                                "graphs", f"{name}"),
                             file_stem_name=f'{name}.collections')
    # create the page legend file for this graph
    GraphWrapper.create_graph_page_legend(file_dir_path=Path(
        os.getenv('ED_OUTPUT_PATH'), "graphs", f"{name}"),
                                          file_stem_name=f'{name}.collections')

    
    # create the collections.json file                                      
    collections_list = [] # holds the list of collections acquired from graph

    with graph.graph_lock:
        for collection in graph.vs.select(is_collection_eq=True, name_ne='base_vertex'):
            collections_list.append({'collection_id': collection['collection_id'],
                                     'collection_title': collection['title'],
                                     'collection_url': collection['name']})
    
    # get a list of non-duplicate collections
    collections_list = get_distinct_collections_from(collections_list,
                                                     min_occurence_counter=1)
    # get the path where the obtained Collections will be saved on local disk
    file_output_path = f'{CURRENT_TRANSFORMER_OUTPUT_DIR}/{(name or "all")}.collections.json'
    # write the collections obtained from the 'name' scraper output to file
    h.write_file(file_output_path, collections_list)
    # upload the collections file to the S3 bucket (if configured)
    h.upload_to_s3_if_configured(file_output_path,
                                 f'{(name or "all")}.collections.json')
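
get_distinct_collections_from is not shown in the example above; a plausible, purely illustrative implementation would deduplicate entries by collection_id while honouring the min_occurence_counter threshold. The filtering rule below is an assumption, not the project's actual code.

# Illustrative sketch only -- not the actual implementation.
from collections import Counter

def get_distinct_collections_from(collections_list, min_occurence_counter=1):
    # count how many times each collection_id appears across the scraped datasets
    counts = Counter(c['collection_id'] for c in collections_list)
    seen, distinct = set(), []
    for collection in collections_list:
        cid = collection['collection_id']
        # keep one entry per collection_id, provided it occurs often enough
        if counts[cid] >= min_occurence_counter and cid not in seen:
            seen.add(cid)
            distinct.append(collection)
    return distinct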
Example #3
    def close_spider(self, spider):
        print("SPIDER CLOSED")

        # write the graph to files
        # this method is explicitly thread/process safe, so no need for a lock
        GraphWrapper.write_graph(file_dir_path=Path(
            os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}"),
                                 file_stem_name=spider.name)

        # create the page legend file for this graph
        # this method is explicitly thread/process safe, so no need for a lock
        GraphWrapper.create_graph_page_legend(file_dir_path=Path(
            os.getenv('ED_OUTPUT_PATH'), "graphs", f"{spider.name}"),
                                              file_stem_name=spider.name)
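
close_spider is a Scrapy item pipeline hook: Scrapy calls it once when the spider finishes, which is why the accumulated graph is only written to disk at that point. A bare-bones pipeline skeleton showing where the method sits is given below; the class name is illustrative, and the GraphWrapper calls from the example above would go inside close_spider.

# Minimal Scrapy item pipeline skeleton (enable it via the ITEM_PIPELINES setting).
class GraphWritingPipeline:

    def process_item(self, item, spider):
        # called for every scraped item; a real pipeline would add the item
        # to the in-memory graph here
        return item

    def close_spider(self, spider):
        # called exactly once when the spider closes -- a safe place to persist
        # the accumulated graph to disk
        print(f'{spider.name} closed, writing graph to disk')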