Example #1
def dir_to_vectors(
        dir_in: str,
        lexical_parser: str,
        nb_jobs: int,
        get_dataset: bool = True,
        ext_in: str = ".txt",
        backend: str = "multiprocessing") -> Union[np.ndarray, pd.DataFrame]:
    backend = safe_concurrency_backend(backend)
    file_names = glob.glob(os.path.join(dir_in, '*' + ext_in))

    syntactic_transformer = StanfordSyntacticTransformer(file_names=False)
    if lexical_parser == 'anc':
        lexical_transformer = ANCStanfordLexicalTransformer(file_names=False)
    elif lexical_parser == 'bnc':
        lexical_transformer = BNCStanfordLexicalTransformer(file_names=False)
    else:
        raise TypeError("Unknown lexical parser type '%s'" % lexical_parser)
    text_preprocessor = TextPreprocessor()

    LOG.info("Creating vectors")
    vectors = Parallel(n_jobs=nb_jobs, verbose=5, backend=backend)(
        delayed(file_to_vector)(f, text_preprocessor, syntactic_transformer,
                                lexical_transformer) for f in file_names)
    if get_dataset:
        LOG.debug("Creating dataset")
        dataset = pd.DataFrame(vectors,
                               columns=get_full_features_names(
                                   syntactic_transformer, lexical_transformer))
        LOG.debug("Adding filenames to dataset")
        dataset.insert(
            0, 'filename',
            [os.path.splitext(os.path.basename(f))[0] for f in file_names])
        return dataset
    else:
        return np.array(vectors)
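A minimal usage sketch for dir_to_vectors; the corpus path, output file, and worker count below are illustrative assumptions, not taken from the source:

# Hypothetical paths; any directory of .txt files would do.
dataset = dir_to_vectors("corpus/texts", lexical_parser="anc", nb_jobs=4)
dataset.to_csv("vectors.csv", index=False)   # DataFrame with a leading 'filename' column

matrix = dir_to_vectors("corpus/texts", lexical_parser="bnc", nb_jobs=4,
                        get_dataset=False)   # plain np.ndarray, no filenames attached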
Example #2
def dir_to_entities(dir_in: str, text_preprocessor: TextPreprocessor, client: DBpediaSpotlightClient,
                    n_jobs: int, backend: str = 'multiprocessing', in_ext: str = ".txt") -> List[TextConcepts]:
    backend = safe_concurrency_backend(backend, urllib_used=True)

    return Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
        delayed(text_to_entities)(filename, text_preprocessor, client, True)
        for filename in glob.glob(os.path.join(dir_in, '*' + in_ext)))
Example #3
def texts_to_entities(texts: Iterable[str], text_preprocessor: TextPreprocessor,
                      client: DBpediaSpotlightClient, n_jobs: int, backend: str = 'multiprocessing') \
        -> List[TextConcepts]:
    backend = safe_concurrency_backend(backend, urllib_used=True)

    return Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
        delayed(text_to_entities)(text, text_preprocessor, client) for text in texts)
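A usage sketch for the in-memory variant. TextPreprocessor() with no arguments follows Example #1; the DBpediaSpotlightClient endpoint and the sample texts are assumptions:

preprocessor = TextPreprocessor()
client = DBpediaSpotlightClient("http://localhost:2222/rest/annotate")  # assumed constructor/endpoint
texts = ["Alan Turing was born in London.",
         "The Eiffel Tower is in Paris."]
concepts = texts_to_entities(texts, preprocessor, client, n_jobs=2)  # one TextConcepts per input text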
Example #4
def concepts_dir_to_graph_files(dir_in: str,
                                dir_out: str,
                                graph_builder: GraphBuilder,
                                n_jobs: int,
                                backend: str = 'multiprocessing',
                                force_rewrite: bool = False,
                                in_ext: str = ".json",
                                out_ext: str = ".json"):
    backend = safe_concurrency_backend(backend, heavy_sharing=True)

    gen = glob.glob(os.path.join(dir_in, '*' + in_ext))
    if dir_out:
        gen = ((file_in,
                os.path.join(
                    dir_out,
                    os.path.splitext(os.path.basename(file_in))[0] + out_ext))
               for file_in in gen)
        if not force_rewrite:
            gen = filter(lambda x: not os.path.exists(x[1]), gen)
    else:
        gen = ((f, None) for f in gen)

    with ModuleShutUpWarning('rdflib'):
        Parallel(n_jobs=n_jobs, verbose=5,
                 backend=backend)(delayed(compute_graph_from_concepts_file)(
                     f_in, graph_builder, f_out) for f_in, f_out in gen)
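A sketch of driving the file-to-file graph step; building GraphBuilder() without arguments and the directory names are assumptions:

builder = GraphBuilder()   # assumed constructor
concepts_dir_to_graph_files("out/concepts", "out/graphs", builder, n_jobs=8)
# With force_rewrite=False (the default), *.json files already present in out/graphs are skipped.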
Example #5
def compute_vectors_from_dir(dir_in: str, graph_reader: Callable[[str], Any], transformer: GraphTransformer,
                             n_jobs: int, ext_in: str = ".json", backend: str = 'multiprocessing',
                             to_dataset: bool = False) -> Union[List[List[float]], pd.DataFrame]:
    backend = safe_concurrency_backend(backend)
    filepaths = list(glob.glob(os.path.join(dir_in, '*' + ext_in)))
    gen = (graph_reader(filepath) for filepath in filepaths)
    res = compute_vectors(gen, transformer, n_jobs, backend, to_dataset)
    if to_dataset:
        LOG.info("Adding filenames to dataset")
        filenames = [os.path.splitext(os.path.basename(filepath))[0] for filepath in filepaths]
        res.insert(0, 'filename', filenames)
    return res
Example #6
def compute_vectors(graphs: Iterable, transformer: GraphTransformer, n_jobs: int,
                    backend: str = 'multiprocessing', to_dataset: bool = False) \
        -> Union[List[List[float]], pd.DataFrame]:
    backend = safe_concurrency_backend(backend)
    with ModuleShutUpWarning('rdflib'):
        vectors = Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
            delayed(compute_vector_from_graph)(g, transformer) for g in graphs
        )
    if to_dataset:
        return pd.DataFrame(vectors, columns=transformer.get_features_names())
    else:
        return vectors
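A sketch of the vectorisation step, assuming `graphs` was produced earlier (e.g. by compute_graphs) and that GraphTransformer() can be built without arguments:

transformer = GraphTransformer()   # assumed constructor
df = compute_vectors(graphs, transformer, n_jobs=4, to_dataset=True)
print(df.columns)   # feature names come from transformer.get_features_names()

rows = compute_vectors(graphs, transformer, n_jobs=4)   # to_dataset=False: plain list of feature lists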
Example #7
def dir_to_entities_json_files(dir_in: str, dir_out: str, text_preprocessor: TextPreprocessor,
                               client: DBpediaSpotlightClient, n_jobs: int, backend: str = 'multiprocessing',
                               force_rewrite: bool = False, in_ext: str = ".txt", out_ext: str = ".json"):
    backend = safe_concurrency_backend(backend, urllib_used=True)

    file_names = ((file_in, os.path.join(dir_out, os.path.splitext(os.path.basename(file_in))[0] + out_ext))
                  for file_in in glob.glob(os.path.join(dir_in, '*' + in_ext)))
    if not force_rewrite:
        file_names = filter(lambda x: not os.path.exists(x[1]), file_names)

    Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
        delayed(text_to_json_file)(f[0], f[1], text_preprocessor, client, True) for f in file_names)
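A sketch of the file-writing variant, reusing the preprocessor and client from the sketch under Example #3; the directory names are assumptions:

dir_to_entities_json_files("corpus/texts", "out/concepts", preprocessor, client, n_jobs=4)
# One .json file per input .txt; outputs that already exist are skipped unless force_rewrite=True.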
Example #8
def compute_graphs(texts_concepts: Iterable[TextConcepts],
                   graph_builder: GraphBuilder,
                   n_jobs: int,
                   backend: str = 'multiprocessing',
                   out_filenames: Optional[Iterable[str]] = None) -> Iterable[Any]:
    backend = safe_concurrency_backend(backend, heavy_sharing=True)

    if out_filenames is None:
        out_filenames = []
    gen = zip_longest(texts_concepts, out_filenames)
    with ModuleShutUpWarning('rdflib'):
        return Parallel(n_jobs=n_jobs, verbose=5, backend=backend)(
            delayed(compute_graph)(tc, graph_builder, of) for tc, of in gen)
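A sketch of the in-memory graph step, assuming `concepts` was obtained as in the sketch under Example #3 (or via dir_to_entities) and that GraphBuilder() takes no arguments:

builder = GraphBuilder()   # assumed constructor
graphs = compute_graphs(concepts, builder, n_jobs=2)   # graphs kept in memory only

out_names = ["out/graphs/text_%d.json" % i for i in range(len(concepts))]
graphs = compute_graphs(concepts, builder, n_jobs=2,
                        out_filenames=out_names)  # presumably also written to disk by compute_graph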