def dir_to_vectors(dir_in: str, lexical_parser: str, nb_jobs: int,
                   get_dataset: bool = True, ext_in: str = ".txt",
                   backend: str = "multiprocessing") \
        -> Union[np.ndarray, pd.DataFrame]:
    """Vectorize every *ext_in* file of *dir_in* in parallel.

    Returns a pandas DataFrame (with a leading 'filename' column) when
    *get_dataset* is True, otherwise a plain numpy array of the vectors.
    """
    backend = safe_concurrency_backend(backend)
    file_names = glob.glob(os.path.join(dir_in, '*' + ext_in))

    syntactic_transformer = StanfordSyntacticTransformer(file_names=False)
    if lexical_parser == 'anc':
        lexical_transformer = ANCStanfordLexicalTransformer(file_names=False)
    elif lexical_parser == 'bnc':
        lexical_transformer = BNCStanfordLexicalTransformer(file_names=False)
    else:
        raise ValueError("Unknown lexical parser type '%s'" % lexical_parser)
    text_preprocessor = TextPreprocessor()

    LOG.info("Creating vectors")
    vectors = Parallel(n_jobs=nb_jobs, verbose=5, backend=backend)(
        delayed(file_to_vector)(f, text_preprocessor, syntactic_transformer,
                                lexical_transformer)
        for f in file_names)

    if get_dataset:
        LOG.debug("Creating dataset")
        dataset = pd.DataFrame(vectors, columns=get_full_features_names(
            syntactic_transformer, lexical_transformer))
        LOG.debug("Adding filenames to dataset")
        # Filenames (without extension) identify each row of the dataset.
        dataset.insert(0, 'filename',
                       [os.path.splitext(os.path.basename(f))[0]
                        for f in file_names])
        return dataset
    return np.array(vectors)


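# A minimal usage sketch (not part of the original module): build a labelled
# dataset from every .txt file of a corpus directory. The directory path,
# parser choice and worker count below are illustrative assumptions.
def _example_dir_to_vectors(corpus_dir: str = "data/texts",
                            workers: int = 4) -> pd.DataFrame:
    # 'anc' and 'bnc' are the only lexical parsers accepted by dir_to_vectors.
    return dir_to_vectors(corpus_dir, lexical_parser="anc", nb_jobs=workers,
                          get_dataset=True)

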
def dir_to_entities(dir_in: str, text_preprocessor: TextPreprocessor,
                    client: DBpediaSpotlightClient, n_jobs: int,
                    backend: str = 'multiprocessing',
                    in_ext: str = ".txt") -> List[TextConcepts]:
    """Annotate every *in_ext* file of *dir_in* with DBpedia Spotlight
    entities, in parallel."""
    backend = safe_concurrency_backend(backend, urllib_used=True)
    return Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
        delayed(text_to_entities)(filename, text_preprocessor, client, True)
        for filename in glob.glob(os.path.join(dir_in, '*' + in_ext)))


def texts_to_entities(texts: Iterable[str],
                      text_preprocessor: TextPreprocessor,
                      client: DBpediaSpotlightClient, n_jobs: int,
                      backend: str = 'multiprocessing') -> List[TextConcepts]:
    """Annotate raw texts with DBpedia Spotlight entities, in parallel."""
    backend = safe_concurrency_backend(backend, urllib_used=True)
    return Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
        delayed(text_to_entities)(text, text_preprocessor, client)
        for text in texts)


def concepts_dir_to_graph_files(dir_in: str, dir_out: str,
                                graph_builder: GraphBuilder, n_jobs: int,
                                backend: str = 'multiprocessing',
                                force_rewrite: bool = False,
                                in_ext: str = ".json", out_ext: str = ".json"):
    """Build one graph file per concepts file of *dir_in*.

    When *dir_out* is given, already-existing output files are skipped
    unless *force_rewrite* is True.
    """
    backend = safe_concurrency_backend(backend, heavy_sharing=True)
    gen = glob.glob(os.path.join(dir_in, '*' + in_ext))
    if dir_out:
        # Pair each input file with its output path in dir_out.
        gen = ((file_in,
                os.path.join(dir_out,
                             os.path.splitext(os.path.basename(file_in))[0]
                             + out_ext))
               for file_in in gen)
        if not force_rewrite:
            gen = filter(lambda x: not os.path.exists(x[1]), gen)
    else:
        gen = ((f, None) for f in gen)
    with ModuleShutUpWarning('rdflib'):
        Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
            delayed(compute_graph_from_concepts_file)(f_in, graph_builder,
                                                      f_out)
            for f_in, f_out in gen)


def compute_vectors_from_dir(dir_in: str, graph_reader: Callable[[str], Any],
                             transformer: GraphTransformer, n_jobs: int,
                             ext_in: str = ".json",
                             backend: str = 'multiprocessing',
                             to_dataset: bool = False):
    """Read every graph file of *dir_in* with *graph_reader* and turn it into
    a feature vector; see compute_vectors for the return type."""
    backend = safe_concurrency_backend(backend)
    filepaths = list(glob.glob(os.path.join(dir_in, '*' + ext_in)))
    gen = (graph_reader(filepath) for filepath in filepaths)
    res = compute_vectors(gen, transformer, n_jobs, backend, to_dataset)
    if to_dataset:
        LOG.info("Adding filenames to dataset")
        filenames = [os.path.splitext(os.path.basename(filepath))[0]
                     for filepath in filepaths]
        res.insert(0, 'filename', filenames)
    return res


def compute_vectors(graphs: Iterable, transformer: GraphTransformer,
                    n_jobs: int, backend: str = 'multiprocessing',
                    to_dataset: bool = False) \
        -> Union[List[List[float]], pd.DataFrame]:
    """Turn graphs into feature vectors in parallel, optionally wrapped in a
    pandas DataFrame labelled with the transformer's feature names."""
    backend = safe_concurrency_backend(backend)
    with ModuleShutUpWarning('rdflib'):
        vectors = Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
            delayed(compute_vector_from_graph)(g, transformer)
            for g in graphs)
    if to_dataset:
        return pd.DataFrame(vectors, columns=transformer.get_features_names())
    return vectors


def dir_to_entities_json_files(dir_in: str, dir_out: str,
                               text_preprocessor: TextPreprocessor,
                               client: DBpediaSpotlightClient, n_jobs: int,
                               backend: str = 'multiprocessing',
                               force_rewrite: bool = False,
                               in_ext: str = ".txt", out_ext: str = ".json"):
    """Annotate every *in_ext* file of *dir_in* and write the entities of each
    file to a JSON file in *dir_out*.

    Existing output files are skipped unless *force_rewrite* is True.
    """
    backend = safe_concurrency_backend(backend, urllib_used=True)
    file_names = ((file_in,
                   os.path.join(dir_out,
                                os.path.splitext(os.path.basename(file_in))[0]
                                + out_ext))
                  for file_in in glob.glob(os.path.join(dir_in, '*' + in_ext)))
    if not force_rewrite:
        file_names = filter(lambda x: not os.path.exists(x[1]), file_names)
    Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
        delayed(text_to_json_file)(f[0], f[1], text_preprocessor, client, True)
        for f in file_names)


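# A usage sketch (not part of the original module) of the file-based pipeline:
# entity JSON files, then graph files, then a feature dataset. The directory
# names are illustrative assumptions; *graph_reader* is whatever callable
# loads one graph file in the form expected by the chosen GraphTransformer.
def _example_file_pipeline(preprocessor: TextPreprocessor,
                           client: DBpediaSpotlightClient,
                           graph_builder: GraphBuilder,
                           transformer: GraphTransformer,
                           graph_reader: Callable[[str], Any],
                           n_jobs: int = 4) -> pd.DataFrame:
    dir_to_entities_json_files("data/txt", "data/concepts", preprocessor,
                               client, n_jobs)
    concepts_dir_to_graph_files("data/concepts", "data/graphs", graph_builder,
                                n_jobs)
    return compute_vectors_from_dir("data/graphs", graph_reader, transformer,
                                    n_jobs, to_dataset=True)

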
def compute_graphs(texts_concepts: Iterable[TextConcepts],
                   graph_builder: GraphBuilder, n_jobs: int,
                   backend: str = 'multiprocessing',
                   out_filenames: Iterable[str] = None) -> Iterable[Any]:
    """Build one graph per TextConcepts in parallel, optionally writing each
    graph to the matching entry of *out_filenames*."""
    backend = safe_concurrency_backend(backend, heavy_sharing=True)
    if out_filenames is None:
        out_filenames = []
    # zip_longest pads missing output filenames with None (graph not saved).
    gen = zip_longest(texts_concepts, out_filenames)
    with ModuleShutUpWarning('rdflib'):
        return Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
            delayed(compute_graph)(tc, graph_builder, of) for tc, of in gen)
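

# An in-memory usage sketch (not part of the original module): annotate raw
# texts, build one graph per text, then produce a feature dataset. All heavy
# collaborators are passed in, so no constructor signatures are assumed.
def _example_text_pipeline(texts: Iterable[str],
                           preprocessor: TextPreprocessor,
                           client: DBpediaSpotlightClient,
                           graph_builder: GraphBuilder,
                           transformer: GraphTransformer,
                           n_jobs: int = 2) -> pd.DataFrame:
    concepts = texts_to_entities(texts, preprocessor, client, n_jobs)
    graphs = compute_graphs(concepts, graph_builder, n_jobs)
    return compute_vectors(graphs, transformer, n_jobs, to_dataset=True)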