Example #1
def main(alto_fulltext_file, language_file, chunksize, processes):
    """
    Read the documents of the corpus from ALTO_FULLTEXT_FILE where each line of the .csv file
    describes one page.

    For each page, classify its language by means of langid.
    Store the classification results as a pickled pandas DataFrame in LANGUAGE_FILE.
    """

    target_path = os.path.dirname(language_file)

    if target_path:
        os.makedirs(target_path, exist_ok=True)

    if alto_fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(alto_fulltext_file, chunksize)
    elif alto_fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(alto_fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    language = list()
    for lan in prun(get_chunk_tasks(chunks),
                    processes=processes,
                    initializer=LanguageTask.initialize):

        language.append(lan)

    language = pd.concat(language, axis=0)

    language.to_pickle(language_file)

    return
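The per-page work happens inside LanguageTask, which is not part of this listing. A minimal sketch of the classification step the docstring describes, assuming each chunk is a pandas DataFrame with a 'text' column (the column name is an assumption):

import langid
import pandas as pd

def classify_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    # langid.classify returns a (language, score) tuple; the score is a
    # log-probability, not a normalized confidence.
    # 'text' as the column name is an assumption, not taken from the listing.
    return chunk['text'].apply(
        lambda t: pd.Series(langid.classify(str(t)), index=['language', 'score']))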
Example #2
def main(alto_fulltext_file, entropy_file, chunksize, processes):
    """
    Read the documents of the corpus from ALTO_FULLTEXT_FILE where each line of the .csv file
    describes one page.

    For each page, compute its character entropy rate and store the result as a pickled pandas DataFrame
    in ENTROPY_FILE.
    """

    os.makedirs(os.path.dirname(entropy_file), exist_ok=True)

    if alto_fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(alto_fulltext_file, chunksize)
    elif alto_fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(alto_fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    entropy = list()
    for et in prun(get_chunk_tasks(chunks), processes=processes):

        entropy.append(et)

    entropy = pd.concat(entropy, axis=0)

    entropy.to_pickle(entropy_file)

    return
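The entropy computation itself happens inside the chunk tasks, which are not shown. A minimal sketch of a character entropy rate, read here as the Shannon entropy of the page's character distribution in bits per character (an assumption about what the task computes):

import math
from collections import Counter

def char_entropy_rate(text: str) -> float:
    # Shannon entropy of the character distribution, in bits per character.
    if not text:
        return 0.0
    counts = Counter(text)
    total = len(text)
    return -sum((n / total) * math.log2(n / total) for n in counts.values())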
Example #3
    def run(entities_file, embeddings, data_sequence, split_parts, processes, n_trees, distance_measure, output_path,
            search_k, max_dist, sem=None):

        return prun(LookUpBySurface._get_all(data_sequence, set(embeddings.keys()), split_parts, sem=sem),
                    processes=processes,
                    initializer=LookUpBySurface.initialize,
                    initargs=(entities_file, embeddings, n_trees, distance_measure, output_path, search_k, max_dist))
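All of these examples funnel their work through prun with the same keyword set (processes, initializer, initargs), but prun itself is not part of the listing. A minimal sketch, assuming it wraps multiprocessing.Pool and that each yielded task object is callable; the in-process fallback for processes <= 0 is likewise an assumption, suggested by the refine_processes=0 default in the next example:

from multiprocessing import Pool

def _call_task(task):
    # Top-level helper so it can be pickled for the worker processes.
    return task()

def prun(tasks, processes=1, initializer=None, initargs=()):
    # Sketch only: yield one result per task, in task order.
    if processes is None or processes <= 0:
        # Run everything in the current process.
        if initializer is not None:
            initializer(*initargs)
        for task in tasks:
            yield task()
        return

    with Pool(processes=processes, initializer=initializer, initargs=initargs) as pool:
        for result in pool.imap(_call_task, tasks):
            yield result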
Example #4
    def run(entities_file,
            context_matrix_file,
            data_sequence_1,
            data_sequence_2,
            embeddings_1,
            ent_type_1,
            split_parts,
            n_trees,
            distance_measure_1,
            output_path,
            search_k_1,
            max_dist,
            lookup_semaphore,
            embeddings_2,
            ent_type_2,
            w_size,
            batch_size,
            embed_semaphore,
            processes,
            refine_processes=0):

        return \
            prun(RefineLookup._get_all(entities_file, data_sequence_1, data_sequence_2, embeddings_1, ent_type_1,
                                       split_parts, n_trees, distance_measure_1, output_path, search_k_1, max_dist,
                                       lookup_semaphore,
                                       embeddings_2, ent_type_2, w_size, batch_size, embed_semaphore, processes),
                 initializer=RefineLookup.initialize, initargs=(context_matrix_file,), processes=refine_processes)
Example #5
def collect(fulltext_file, selection_file, corpus_file, chunksize, processes, min_line_len):
    """
    Reads the full text from a CSV or SQLITE3 file (see also altotool) and writes it to one big text file.

    FULLTEXT_FILE: The CSV or SQLITE3 file to read from.

    SELECTION_FILE: Consider only the subset of all pages that is defined by the DataFrame
    stored in <selection_file>.

    CORPUS_FILE: The output file that can be used by bert-pregenerate-trainingdata.
    """
    os.makedirs(os.path.dirname(corpus_file), exist_ok=True)

    print('Open {}.'.format(corpus_file))
    corpus_fh = codecs.open(corpus_file, 'w+', 'utf-8')
    corpus_fh.write(u'\ufeff')

    if fulltext_file.endswith('.csv'):
        chunks = get_csv_chunks(fulltext_file, chunksize)
    elif fulltext_file.endswith('.sqlite3'):
        chunks = get_sqlite_chunks(fulltext_file, chunksize)
    else:
        raise RuntimeError('Unsupported input file format.')

    for text in prun(get_chunk_tasks(chunks, min_line_len), processes=processes, initializer=ChunkTask.initialize,
                     initargs=(selection_file,)):

        corpus_fh.write(text)

    corpus_fh.close()

    return
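get_csv_chunks and get_sqlite_chunks recur throughout the listing but are not shown. A minimal sketch, assuming they simply wrap pandas' chunked readers; the 'text' table name is an assumption borrowed from the to_sqlite example further down:

import sqlite3
import pandas as pd

def get_csv_chunks(csv_file, chunksize):
    # pandas returns an iterator of DataFrames when chunksize is given.
    return pd.read_csv(csv_file, chunksize=chunksize)

def get_sqlite_chunks(sqlite_file, chunksize):
    # Assumes the full text lives in a table called 'text', as created by to_sqlite below.
    conn = sqlite3.connect(sqlite_file)
    return pd.read_sql('select * from text', con=conn, chunksize=chunksize)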
Example #6
    def infinite_feature_sequence(self):

        features = []
        candidates = []
        current_entity = None

        for entity_id, candidate, fe in \
                prun(self.get_feature_tasks(), initializer=ConvertSamples2Features.initialize,
                     initargs=(self._tokenizer, self._max_seq_length), processes=self._feature_processes):

            if entity_id is None:
                # sentinel: the current entity is complete - emit it and start a new group
                yield current_entity, features, pd.concat(candidates) if len(candidates) > 0 else []

                features = []
                candidates = []
                current_entity = None
                continue

            if current_entity is None:
                current_entity = entity_id

            if fe is not None:
                features.append(fe)

            if candidate is not None:
                candidates.append(candidate)
Example #7
    def run(embeddings, data_sequence, ent_type, w_size, batch_size, processes, sem=None, start_iteration=0):

        for result in \
                prun(EmbedWithContext._get_all(data_sequence, start_iteration, ent_type, w_size, batch_size, sem),
                     processes=processes, initializer=EmbedWithContext.initialize, initargs=(embeddings,)):

            for _, link_result in result.iterrows():
                yield link_result
Example #8
    def run(index_file, mapping_file, distance_measure, search_k,
            embeddings, data_sequence, start_iteration, ent_type, w_size, batch_size, processes, sem=None):

        return prun(
            LookUpBySurfaceAndContext._get_all(embeddings, data_sequence, start_iteration, ent_type, w_size, batch_size,
                                               processes, sem), processes=3*processes,
            initializer=LookUpBySurfaceAndContext.initialize,
            initargs=(index_file, mapping_file, distance_measure, search_k))
Example #9
def sentence_stat(tsv_file, json_file, clef_gs_file, data_set_file, min_pairs,
                  max_pairs, processes):

    tsv = pd.read_csv(tsv_file, sep='\t', comment='#', quoting=3)
    tsv.loc[tsv.TOKEN.isnull(), 'TOKEN'] = ""

    tsv_gs = pd.read_csv(clef_gs_file, sep='\t', comment='#', quoting=3)
    tsv_gs.loc[tsv_gs.TOKEN.isnull(), 'TOKEN'] = ""

    with open(json_file, 'r') as fp_json:
        ned_result = json.load(fp_json)

    ned_result = add_ground_truth(ned_result, tsv, tsv_gs)

    applicable_results = sum([
        'gt' in entity_result and 'decision' in entity_result
        for _, entity_result in ned_result.items()
    ])

    rank_intervalls = np.linspace(0.001, 0.1, 100)
    quantiles = np.linspace(0.1, 1, 10)

    def get_tasks():

        nonlocal rank_intervalls, quantiles

        for entity_id, entity_result in ned_result.items():

            if 'gt' not in entity_result:
                continue

            if 'decision' not in entity_result:
                continue

            yield SentenceStatTask(entity_result, quantiles, rank_intervalls,
                                   min_pairs, max_pairs)

    progress = tqdm(prun(get_tasks(), processes=processes),
                    total=applicable_results)

    data = list()
    data_len = 0
    for data_part in progress:

        if data_part is None:
            continue

        data.append(data_part)
        data_len += len(data_part)

        progress.set_description("#data: {}".format(data_len))
        progress.refresh()

    data = pd.concat(data)

    data.to_pickle(data_set_file)
Example #10
    def get_lookup(self):

        for entity_id, ent_type, sentences, (_, embedded, embedding_config) in \
                prun(self.get_embed(), initializer=EmbedTask.initialize, initargs=(self._embeddings,),
                     processes=self._embed_processes):

            yield LookUpByEmbeddingWrapper(entity_id, sentences, page_title=entity_id, entity_embeddings=embedded,
                                           embedding_config=embedding_config, entity_title=entity_id,
                                           entity_type=ent_type, split_parts=self._split_parts,
                                           max_candidates=None)  # return all the candidates - filtering is done below
Example #11
def to_csv(source_dir, output_file, processes):

    with open(output_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['file_name', 'text', 'wc', 'ppn'])

        for filename, text, wc, ppn in prun(ExtractTask.get_all(source_dir),
                                            processes=processes):

            if filename is None:
                continue

            writer.writerow([filename, text, wc, ppn])
Example #12
    def infinite_feature_sequence(self):

        results = dict()

        for job_id, entity_id, candidate, fe in \
                prun(self.get_feature_tasks(), initializer=ConvertSamples2Features.initialize,
                     initargs=(self._tokenizer, self._max_seq_length), processes=self._feature_processes):

            self._queue_final_output.add_to_job(job_id,
                                                (entity_id, candidate, fe))

            while True:
                job_id, task_info, iter_quit = self._queue_final_output.get_next_task()

                if iter_quit:
                    return

                if task_info is None:
                    break

                entity_id, candidate, fe, params = task_info

                if self._verbose:
                    print("infinite_feature_sequence: {}:{}".format(
                        job_id, entity_id))

                if job_id not in results:
                    results[job_id] = {
                        'features': [],
                        'candidates': [],
                        'entity_id': entity_id
                    }

                if entity_id is None:

                    result = results.pop(job_id)

                    yield job_id, (result['entity_id'], result['features'],
                                   (pd.concat(result['candidates'])
                                    if len(result['candidates']) > 0 else []))

                    continue

                if fe is not None:
                    results[job_id]['features'].append(fe)

                if candidate is not None:
                    results[job_id]['candidates'].append(candidate)
Example #13
def _sentence_stat(ned_result, tsv, tsv_gs, min_pairs, max_pairs, processes):
    ned_result = add_ground_truth(ned_result, tsv, tsv_gs)

    applicable_results = sum([
        'gt' in entity_result and 'decision' in entity_result
        for _, entity_result in ned_result.items()
    ])

    rank_intervalls = np.linspace(0.001, 0.1, 100)
    quantiles = np.linspace(0.1, 1, 10)

    def get_tasks():

        nonlocal rank_intervalls, quantiles

        for entity_id, entity_result in ned_result.items():

            if 'gt' not in entity_result:
                continue

            if 'decision' not in entity_result:
                continue

            yield SentenceStatTask(entity_result, quantiles, rank_intervalls,
                                   min_pairs, max_pairs)

    progress = tqdm(prun(get_tasks(), processes=processes),
                    total=applicable_results)

    data = list()
    data_len = 0
    for data_part in progress:

        if data_part is None:
            continue

        data.append(data_part)
        data_len += len(data_part)

        progress.set_description("#data: {}".format(data_len))
        progress.refresh()

    if len(data) < 1:
        return pd.DataFrame()

    data = pd.concat(data)

    return data
Example #14
def to_sqlite(source_dir, output_file, processes):

    with sqlite3.connect(output_file) as conn:

        conn.execute('pragma journal_mode=wal')

        for idx, (filename, text, wc, ppn) in\
                enumerate(prun(ExtractTask.get_all(source_dir), processes=processes)):

            if filename is None:
                continue

            pd.DataFrame({'id': idx, 'file_name': filename, 'text': text, 'wc': wc, 'ppn': ppn}, index=[idx]).\
                reset_index(drop=True).set_index('id').\
                to_sql('text', con=conn, if_exists='append', index_label='id')

        conn.execute('create index idx_ppn on text(ppn);')
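A short usage sketch for the database produced here, pulling all pages of a single PPN back into a DataFrame via the idx_ppn index; the file name and the PPN value are placeholders:

import sqlite3
import pandas as pd

with sqlite3.connect('fulltext.sqlite3') as conn:
    pages = pd.read_sql('select file_name, text, wc, ppn from text where ppn = ?',
                        con=conn, params=('PPN123456789',))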
Example #15
    def get_sentence_pairs(self):

        for job_id, entity_id, candidate, pairs in \
                prun(self.get_sentence_lookup(), initializer=SentenceLookup.initialize,
                     initargs=(self._ned_sql_file, ), processes=self._pairing_processes):

            if entity_id is None:
                # signal entity_id == None
                self._queue_pairs.add_to_job(job_id, (entity_id, None, None))
            else:

                if pairs is None:
                    self._queue_pairs.add_to_job(job_id,
                                                 (entity_id, candidate, None))
                else:

                    for idx, row in pairs.iterrows():
                        pair = (row.id_a, row.id_b, json.loads(row.sen_a),
                                json.loads(row.sen_b), row.pos_a, row.pos_b,
                                row.end_a, row.end_b, row.label)

                        self._queue_pairs.add_to_job(
                            job_id, (entity_id, candidate, pair))

                        candidate = None

            while True:
                job_id, task_info, iter_quit = self._queue_pairs.get_next_task()

                if iter_quit:
                    return

                if task_info is None:
                    break

                entity_id, candidate, pair, params = task_info

                if self._verbose:
                    print("get_sentence_pairs: {}:{}".format(
                        job_id, entity_id))

                yield job_id, entity_id, candidate, pair
Example #16
    def process_sequence(self):

        complete_result = OrderedDict()

        for eid, result in prun(self.get_decider_tasks(),
                                initializer=DeciderTask.initialize,
                                initargs=(self._decider, self._entities),
                                processes=self._decider_processes):

            if eid is None:
                print('process_sequence done.')
                yield complete_result
                complete_result = OrderedDict()
                continue

            if result is None:
                continue

            complete_result[eid] = result
Example #17
def altoannotator(tagged_sqlite_file, source_dir, dest_dir, processes,
                  no_gzip):
    """
    Read NER tagging results from TAGGED_SQLITE_FILE.
    Read ALTO XML files in subfolders of directory SOURCE_DIR.
    Annotate the XML content with NER information and write the annotated ALTO XML back to the same directory
    structure in DEST_DIR.
    """

    dest_dir = "{}/{}".format(
        dest_dir,
        os.path.splitext(os.path.basename(tagged_sqlite_file))[0])

    os.makedirs(dest_dir, exist_ok=True)

    for _ in prun(AnnotateTask.get_all(source_dir, dest_dir, no_gzip),
                  processes=processes,
                  initializer=AnnotateTask.initialize,
                  initargs=(tagged_sqlite_file, )):
        pass
Example #18
    def infinite_process_sequence(self):

        for job_id, (eid, result) in \
                prun(self.get_decider_tasks(), initializer=DeciderTask.initialize,
                     initargs=(self._decider, self._entities), processes=self._decider_processes):

            self._queue_final_output.add_to_job(job_id, (eid, result))

            while True:
                job_id, task_info, iter_quit = self._queue_final_output.get_next_task()

                if iter_quit:
                    return

                if task_info is None:
                    break

                eid, result, params = task_info

                yield job_id, (eid, result)
Example #19
    def get_decider_tasks(self):

        for entity_id, decision, candidates in prun(
                self.get_classifier_tasks(),
                initializer=ClassifierTask.initialize,
                initargs=(self._no_cuda, self._model_dir, self._model_file,
                          self._batch_size),
                processes=self._classifier_processes):

            if candidates is None:
                yield DeciderTask(entity_id=None,
                                  decision=None,
                                  candidates=None,
                                  quantiles=None,
                                  rank_intervalls=None,
                                  threshold=None)
                continue

            yield DeciderTask(entity_id, decision, candidates, self._quantiles,
                              self._rank_intervalls, self._threshold,
                              self._return_full)
Example #20
    def get_sentence_pairs(self):

        for entity_id, candidate, pairs in \
                prun(self.get_sentence_lookup(), initializer=SentenceLookup.initialize,
                     initargs=(self._ned_sql_file, ), processes=self._pairing_processes):

            if entity_id is None:

                # signal entity_id == None
                yield None, None, None
                continue

            if pairs is None:
                continue

            for idx, row in pairs.iterrows():

                pair = (row.id_a, row.id_b, json.loads(row.sen_a), json.loads(row.sen_b),
                        row.pos_a, row.pos_b, row.end_a, row.end_b, row.label)

                yield entity_id, candidate, pair

                candidate = None
Example #21
    def get_sentence_lookup(self):

        for sentences, (entity_id, candidates) in \
                prun(self.get_lookup(), initializer=LookUpByEmbeddings.initialize,
                     initargs=(self._entities_file, self._entity_types, self._n_trees, self._distance_measure,
                               self._entity_index_path, self._search_k, self._max_dist),
                     processes=self._lookup_processes):

            if entity_id is None:
                # signal entity_id == None
                yield SentenceLookupWrapper(entity_id=None)
                continue

            candidates = candidates.merge(self._entities[['proba']], left_on="guessed_title", right_index=True)

            candidates = candidates.\
                sort_values(['match_uniqueness', 'dist', 'proba', 'match_coverage', 'len_guessed'],
                            ascending=[False, True, False, False, True])

            candidates = candidates.iloc[0:self._max_candidates]

            for idx in range(0, len(candidates)):
                yield SentenceLookupWrapper(entity_id, sentences=sentences, candidates=candidates.iloc[[idx]],
                                            max_pairs=self._max_pairs)
Example #22
    def get_decider_tasks(self):

        for job_id, entity_id, decision, candidates in \
                prun(self.get_classifier_tasks(), initializer=ClassifierTask.initialize,
                     initargs=(self._no_cuda, self._model_dir, self._model_file, self._batch_size),
                     processes=self._classifier_processes):

            self._queue_decider.add_to_job(job_id,
                                           (entity_id, decision, candidates))

            while True:
                job_id, task_info, iter_quit = self._queue_decider.get_next_task()

                if iter_quit:
                    return

                if task_info is None:
                    break

                entity_id, decision, candidates, params = task_info

                print("get_decider_tasks: {}:{}".format(job_id, entity_id))

                if entity_id is None:
                    continue
                if candidates is None:
                    continue

                yield DeciderTaskWrapper(job_id,
                                         entity_id=entity_id,
                                         decision=decision,
                                         candidates=candidates,
                                         quantiles=self._quantiles,
                                         rank_intervalls=self._rank_intervalls,
                                         **params)
Example #23
    def run(embeddings, all_entities, split_parts, processes):

        return prun(EmbedTask._get_all(all_entities, split_parts),
                    processes=processes,
                    initializer=EmbedTask.initialize,
                    initargs=(embeddings, ))
Example #24
def on_db_file(fulltext_sqlite_file, selection_file, model_name, ner_endpoint, chunksize, noproxy,
               processes, outfile):
    """
    Reads the text content per page of digitized collections from the sqlite file FULLTEXT_SQLITE_FILE.

    Considers only a subset of documents that is defined by SELECTION_FILE.

    Performs NER on the text content using the REST endpoint[s] NER_ENDPOINT ....

    Writes the NER results back to another sqlite file whose name is derived from FULLTEXT_SQLITE_FILE plus
    '-ner-' and the model name, or to the file specified via the --outfile option.

    Writes results in chunks of size <chunksize>.

    Suppress proxy with option --noproxy.
    """

    if noproxy:
        os.environ['no_proxy'] = '*'

    logging.info('Using endpoints: {}'.format(ner_endpoint))

    model_name = model_name.replace(" ", "")

    ner_endpoint_tmp = []
    for endpoint in ner_endpoint:

        models = json.loads(requests.get("{}/models".format(endpoint)).content)

        models = pd.DataFrame.from_dict(models)[['name', 'id']]

        models['name'] = models['name'].str.replace(" ", "")

        models = models.set_index('name')

        ner_endpoint_tmp.append("{}/ner/{}".format(endpoint, models.loc[model_name]['id']))

    ner_endpoint = ner_endpoint_tmp

    if outfile is None:
        tagged_sqlite_file = os.path.splitext(
            os.path.basename(fulltext_sqlite_file))[0] + "-ner-" + model_name + ".sqlite3"
    else:
        tagged_sqlite_file = outfile

    start_row = 0
    if os.path.exists(tagged_sqlite_file):

        with create_connection(tagged_sqlite_file) as read_conn:

            start_row = read_conn.execute('select max(id) from tagged').fetchone()[0] + 1

            logger.info('Starting from idx: {}'.format(start_row))

    with create_connection(tagged_sqlite_file) as write_conn:

        tagged = []

        for num, ppn, file_name, text, tags, original_text, received_text in\
            prun(NERTask.get_all(fulltext_sqlite_file, selection_file, ner_endpoint, start_row),
                 processes=len(ner_endpoint) if processes is None else processes):

            tagged.append({'id': num, 'ppn': ppn, 'file_name': file_name, 'text': text, 'tags': tags})

            if original_text != received_text:
                logging.warning('PPN: {}, file_name: {}\n\n\nInput and output differ:\n\nInput: {}\n\nOutput:{}'.
                                format(ppn, file_name, original_text, received_text))

            if len(tagged) >= chunksize:
                # noinspection PyTypeChecker
                df_tagged = pd.DataFrame.from_dict(tagged).reset_index(drop=True).set_index('id')

                df_tagged.to_sql('tagged', con=write_conn, if_exists='append', index_label='id')

                tagged = []

        if len(tagged) > 0:
            # noinspection PyTypeChecker
            df_tagged = pd.DataFrame.from_dict(tagged).reset_index(drop=True).set_index('id')

            df_tagged.to_sql('tagged', con=write_conn, if_exists='append', index_label='id')

        try:
            write_conn.execute('create index idx_ppn on tagged(ppn);')
        except sqlite3.OperationalError:
            pass

    return
Example #25
def ned_sentence_data(tagged_sqlite_file, ned_sqlite_file, processes,
                      writequeue):
    """

    TAGGED_SQLITE_FILE: A sqlite database file that contains all wikipedia articles where the relevant
    entities have been tagged. This is a database that gives per article access to the tagged sentences,
    it can be created using 'tag-wiki-entities2sqlite'.


    NED_SQLITE_FILE: Output database. This database gives fast per entity and per sentence access, i.e., it
    provides a fast answer to the question: "Give me all sentences where entity X is discussed."

    """

    first_write = True

    sentence_counter = 0
    link_counter = 0

    # prevent infinite growth of multiprocessing queue
    sem = Semaphore(writequeue)

    with sqlite3.connect(ned_sqlite_file) as write_conn:

        write_conn.execute('pragma journal_mode=wal')

        for df_sentence, df_linking in prun(NEDDataTask.get_all(
                tagged_sqlite_file, sem=sem),
                                            processes=processes):

            if df_sentence is None:
                sem.release()
                continue

            df_sentence['id'] += sentence_counter
            df_linking['sentence'] += sentence_counter
            df_linking['id'] = [
                link_counter + i for i in range(len(df_linking))
            ]

            sentence_counter += len(df_sentence)
            link_counter += len(df_linking)

            df_sentence.set_index('id').to_sql('sentences',
                                               con=write_conn,
                                               if_exists='append',
                                               index_label='id')
            df_linking.set_index('id').to_sql('links',
                                              con=write_conn,
                                              if_exists='append',
                                              index_label='id')

            if first_write:
                write_conn.execute('create index idx_target on links(target);')
                write_conn.execute(
                    'create index idx_sentence on links(sentence);')
                write_conn.execute(
                    'create index idx_page_id on sentences(page_id);')
                write_conn.execute(
                    'create index idx_page_title on sentences(page_title);')

                first_write = False

            sem.release()
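A short sketch of the query this database is built for ('give me all sentences where entity X is discussed'), served by the idx_target and idx_sentence indices; the file name and the entity value are placeholders, and the join assumes that links.sentence references sentences.id, as the id bookkeeping above suggests:

import sqlite3
import pandas as pd

with sqlite3.connect('ned.sqlite3') as conn:
    sentences = pd.read_sql(
        'select s.* from links l join sentences s on s.id = l.sentence where l.target = ?',
        con=conn, params=('Berlin',))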