Example No. 1
def after_step(context, step):
    # behave hook: runs after every step; only failed steps need handling.
    if step.status != "failed":
        return

    if context.create_log is True:
        append_logfile(context, step)

    if context.create_smartview is True and hasattr(context, "falseguids"):
        add_smartview(context.smview_file, step.name, context.falseguids)
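The flags this hook reads (context.create_log, context.create_smartview) are typically initialised once in behave's before_all hook. A minimal sketch, assuming the flags come in via behave's -D userdata; the key names here are hypothetical, not taken from the project:

# environment.py -- hypothetical setup for the hook above.
def before_all(context):
    context.create_log = context.config.userdata.getbool("create_log", False)
    context.create_smartview = context.config.userdata.getbool(
        "create_smartview", False)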
Example No. 2
    def preprocess_wos_articles_from_dir(self, data_dir, preprocessing,
                                         remove_stopwords, nodes_to_analyze):
        """Method to extract, load and preprocess article metadata provided
        by Web of Science with support for multiprocessing.

        Parameters
        ----------
        data_dir : str
        preprocessing : str
            Specifies which preprocessing method to apply. Supported strings:
            'word_tokenize', 'sentences_with_lemmas', 'pos_tag', and 'lemmatize'.

        remove_stopwords : str
            Specifies which stopword-list to apply. Supported strings: Nltk-Stopwords

        nodes_to_analyze: Nodes
            nodes that have to be analyzed

        Returns
        -------
        assets : list(Asset)

        """
        start = timeit.default_timer()
        stream_processing_jobs = []

        for root, dirs, files in os.walk(data_dir):
            for name in files:
                # Join against the walked root so files in subdirectories
                # (and a data_dir without a trailing separator) resolve correctly.
                file_path = os.path.join(root, name)
                stream_processing_job = {"preprocessing": preprocessing,
                                         "remove_stopwords": remove_stopwords,
                                         'file_path': file_path,
                                         'nodes_to_analyze': nodes_to_analyze,
                                         'stopwords': self.stop_words}
                stream_processing_jobs.append(stream_processing_job)

        p = Pool(processes=cpu_count() - 1)
        assets = p.map(stream_preprocessing, stream_processing_jobs)
        p.close()
        p.join()
        assets = list(itertools.chain.from_iterable(assets))

        # Logfile
        stop = timeit.default_timer()
        runtime = stop - start
        event_title = "Load and preprocess Academic Data from Directory"
        event_description = (
            f"Importing {len(assets)} academic assets from directory "
            f"into assetlist. Preprocessing = {preprocessing}")
        append_logfile(logfile_path=self.logfile_path,
                       event_title=event_title,
                       event_description=event_description,
                       runtime=runtime)
        return assets
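The stream_preprocessing worker consumed by Pool.map is not shown in this example. A minimal sketch of the shape it must have — one job dict in, a list of assets out — where the body is purely illustrative (the real worker builds Asset objects with the named preprocessing methods):

def stream_preprocessing(job):
    # Illustrative only: read one file and return a list of processed items.
    assets = []
    with open(job["file_path"], encoding="utf-8") as fh:
        for line in fh:
            tokens = line.strip().split()
            if job["remove_stopwords"]:
                tokens = [t for t in tokens
                          if t.lower() not in job["stopwords"]]
            assets.append(" ".join(tokens))  # real code: Asset objects
    return assets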
Example No. 3
    def stop_timer_and_log(self, details=''):
        """Stop the timer and append the elapsed runtime to the logfile."""
        stop_time = timeit.default_timer()
        runtime = stop_time - self.start_time
        event_title = 'Algorithm: ' + self.alg_name
        event_description = details
        if self.logfile_path is not None:
            append_logfile(logfile_path=self.logfile_path,
                           event_title=event_title,
                           event_description=event_description,
                           runtime=runtime)
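stop_timer_and_log reads self.start_time, self.alg_name, and self.logfile_path; a hypothetical host class showing where those attributes would come from (only the attribute names are taken from the method above, everything else is assumed):

import timeit

class AlgorithmTimer:
    # Hypothetical host class; the real one is not part of the snippet.
    def __init__(self, alg_name, logfile_path=None):
        self.alg_name = alg_name
        self.logfile_path = logfile_path
        self.start_time = timeit.default_timer()  # started on construction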
Example No. 4
def load_assetlist_from_dir(file_path, logfile_path=None):
    """Load a pickled list of assets from file_path, optionally logging the event."""
    start_time = timeit.default_timer()
    with open(file_path, "rb") as fp:
        assetlist = pickle.load(fp)
    event_description = f"{len(assetlist)} Assets loaded from {file_path}"
    stop_time = timeit.default_timer()
    runtime = stop_time - start_time
    event_title = 'Load list of assets'
    if logfile_path is not None:
        append_logfile(logfile_path=logfile_path,
                       event_title=event_title,
                       event_description=event_description,
                       runtime=runtime)

    return assetlist
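For the round trip, a hypothetical save counterpart (pickle is the only assumption carried over from the loader above; the function name is illustrative):

import pickle

def save_assetlist(assetlist, file_path):
    # Serialise the asset list so load_assetlist_from_dir can read it back.
    with open(file_path, "wb") as fp:
        pickle.dump(assetlist, fp)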
Example No. 5
    def preprocess_patent_files_from_dir(
            self, data_dir, preprocessing, remove_stopwords, nodes_to_analyze,
            filter_patents_by_node):
        """Method to extract, load and preprocess patent data parsed
        by our uspto_xml_parser with support for multiprocessing.

        Parameters
        ----------
        data_dir : str
        preprocessing : str
            Specifies which preprocessing method to apply. Supported strings:
            'word_tokenize', 'sentences_with_lemmas', 'pos_tag', and 'lemmatize'.

        remove_stopwords : str
            Specifies which stopword-list to apply. Supported strings: Nltk-Stopwords

        nodes_to_analyze: Nodes
            nodes that have to be analyzed

        filter_patents_by_node: bool

        Returns
        -------
        nothing

        """
        start = timeit.default_timer()
        stream_processing_jobs = []

        for root, dirs, files in os.walk(data_dir):
            for name in files:
                # Join against the walked root so files in subdirectories
                # (and a data_dir without a trailing separator) resolve correctly.
                file_path = os.path.join(root, name)
                if os.path.getsize(file_path) > 0:
                    stream_processing_job = {"preprocessing": preprocessing,
                                             "remove_stopwords": remove_stopwords,
                                             "file_path": file_path,
                                             "nodes_to_analyze": nodes_to_analyze,
                                             "filter_patents_by_node": filter_patents_by_node}
                    stream_processing_jobs.append(stream_processing_job)
                else:
                    print("Empty File!")

        p = Pool(processes=cpu_count() - 1, maxtasksperchild=1)
        asset_cnt = 0
        # Stream results back as workers finish instead of collecting them all.
        for assets in p.imap_unordered(stream_preprocessing, stream_processing_jobs):
            nodes_to_analyze.enrich_with_assets(assets)
            print(f"Imported {len(assets)} assets into nodes")
            asset_cnt += len(assets)
        p.close()
        p.join()

        # Logfile
        stop = timeit.default_timer()
        runtime = stop - start
        event_title = "Load and preprocess Patent Data from Directory"
        event_description = (
            f"Importing {asset_cnt} patents from directory into assetlist. "
            f"Preprocessing = {preprocessing}")
        append_logfile(logfile_path=self.logfile_path,
                       event_title=event_title,
                       event_description=event_description,
                       runtime=runtime)
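Unlike Example No. 2, which collects all results at once with Pool.map, this variant streams results back with imap_unordered and recycles each worker after a single job (maxtasksperchild=1), which keeps parent and worker memory bounded when the patent XML files are large. A minimal, self-contained illustration of that pattern (toy data, not the project's code):

from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == "__main__":
    with Pool(processes=2, maxtasksperchild=1) as p:
        for result in p.imap_unordered(square, range(5)):
            print(result)  # results arrive as workers finish, in no fixed order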