Example #1
def compute_stanford_ner(source_file,
                         service_url=STANFORD_CORE_NLP_URL,
                         excludes=None):
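    # Tag every document in the compressed archive `source_file` with the
    # Stanford CoreNLP NER tagger and collect the recognized entities.
    # `excludes` lists NER tags (default 'O', the non-entity tag) that
    # recognize_named_entities is expected to skip.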

    excludes = excludes or ('O', )

    assert os.path.isfile(source_file), 'File missing!'

    tagger = corenlp.CoreNLPParser(url=service_url,
                                   encoding='utf8',
                                   tagtype='ner')

    reader = text_corpus.CompressedFileReader(source_file)
    document_index = domain_logic.compile_documents_by_filename(
        reader.filenames)
    stream = domain_logic.get_document_stream(reader,
                                              'en',
                                              document_index=document_index)

    i = 0
    ner_data = []
    for filename, text, metadata in stream:
        print(filename)
        document_id = document_index.loc[document_index.filename == filename,
                                         'document_id'].values[0]
        ner = recognize_named_entities(tagger, document_id, text, excludes)
        ner_data.extend(ner)
        i += 1
        if i % 10 == 0:
            logger.info('Processed {} files...'.format(i))
        # break  # debug: stop after the first document
    return ner_data
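
A minimal invocation might look like the sketch below; the archive path and the pandas conversion are illustrative assumptions (they also presume that recognize_named_entities returns a flat list of records), not part of the original example.

import pandas as pd

ner_rows = compute_stanford_ner('treaties_corpus.zip')  # hypothetical archive path
df_ner = pd.DataFrame(ner_rows)                         # inspect the collected entities
print(df_ner.head())
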
Example #2
def get_document_stream(source, lang, document_index=None, id_extractor=None):

    assert document_index is not None

    if 'document_id' not in document_index.columns:
        document_index['document_id'] = document_index.index

    id_extractor = id_extractor or (lambda filename: filename.split('_')[0])
    lang_pattern = re.compile(r"^(\w*)_" + lang + r"([_\-]corr)?\.txt$")
    item_filter = lambda x: lang_pattern.match(x)  # and id_extractor(x) in document_index.index

    if isinstance(source, str):
        print('Opening archive: {}'.format(source))
        reader = text_corpus.CompressedFileReader(source,
                                                  pattern=lang_pattern,
                                                  itemfilter=item_filter)
    else:
        reader = source

    id_map = {
        filename: id_extractor(filename)
        for filename in reader.filenames if item_filter(filename)
    }

    if len(set(document_index.index) - set(id_map.values())) > 0:
        logger.warning(
            'Treaties not found in archive: ' +
            ', '.join(list(set(document_index.index) - set(id_map.values()))))

    columns = ['signed_year', 'party1', 'party2']

    df = document_index[columns]

    for filename, text in reader:

        document_id = id_map.get(filename, None)

        if document_id not in df.index:
            continue

        metadata = df.loc[document_id].to_dict()

        metadata['filename'] = filename
        metadata['document_id'] = document_id
        metadata['treaty_id'] = document_id

        yield filename, document_id, text, metadata
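
Note that this variant yields four items per document, so a consumer would unpack it accordingly. A rough sketch follows; the archive path and the treaty index DataFrame are placeholders, not taken from the original code.

stream = get_document_stream('treaties_en.zip', 'en', document_index=treaty_index)
for filename, document_id, text, metadata in stream:
    # metadata carries the columns selected above: signed_year, party1, party2
    print(document_id, metadata['signed_year'], len(text))
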
Example #3
def get_document_stream(source, lang, **kwargs):

    id_map = {}

    if isinstance(source, str):
        # FIXME Use "smart_open" or "open_sesame" library instead
        reader = text_corpus.CompressedFileReader(source)
    else:
        reader = source

    lookup = compile_documents_by_filename(
        reader.filenames).set_index('filename')
    lookup['filename'] = lookup.index

    row_id = 0
    for filename, text in reader:
        metadata = lookup.loc[filename].to_dict()
        yield filename, text, metadata
        row_id += 1
Example #4
def get_document_stream(source, lang, **kwargs):

    reader = text_corpus.CompressedFileReader(source) if isinstance(
        source, str) else source

    df_corpus_index = compile_unesco_corpus_index(reader)

    #reader.filenames = sorted(list(df_corpus_index[(df_corpus_index.year//10).isin([194, 195, 196, 197, 198])].filename.values))

    logger.info('Note! Filter is applied: Only first file for each year.')
    reader.filenames = list(
        df_corpus_index[df_corpus_index.filename.str.contains('en')].groupby(
            'year')['filename'].min().values)

    df_corpus_index = df_corpus_index.loc[df_corpus_index.filename.isin(
        reader.filenames)].sort_values('filename')

    assert len(reader.filenames) == len(df_corpus_index)

    n_words_threshold = kwargs.get('n_words_threshold', 10)
    n_bytes_trunc_size = kwargs.get('n_bytes_trunc_size', None)

    logger.info('Files having less than {} words will be skipped'.format(
        n_words_threshold))
    if n_bytes_trunc_size is not None:
        logger.info('Files greater than {} bytes will be truncated'.format(
            n_bytes_trunc_size))

    processed_count = 0
    empty_count = 0
    truncated_count = 0
    yielded_count = 0

    for filename, text in reader:

        processed_count += 1

        local_number, lang = split_name(filename)
        local_number = int(local_number)

        metadata = df_corpus_index.loc[local_number].to_dict()

        if metadata['n_words'] < n_words_threshold:
            #logger.info('WARNING: Skipping empty file {} '.format(filename))
            empty_count += 1
            continue

        if metadata['lang'] != 'en':
            logger.warning('Skipping file (unknown language) {}'.format(filename))
            continue

        if n_bytes_trunc_size is not None and metadata[
                'n_bytes'] > n_bytes_trunc_size:
            text = text[:n_bytes_trunc_size]
            truncated_count += 1

#        i = 0
#        for segment in chunk_text(text, step=50000):
#            basename = str(local_number).zfill(10) + '_' + str(i).zfill(3)
#            yield filename, text[:100000], metadata
#            i += 1

        yield filename, text, metadata
        yielded_count += 1

    logger.info(
        'Corpus read done: {} processed files, {} empty files, {} truncated files, {} files yielded'
        .format(processed_count, empty_count, truncated_count, yielded_count))
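
The helper split_name used above is not shown in any of these examples; assuming a filename convention like the one in Example #2 (e.g. 001234_en.txt), it presumably separates the numeric prefix from the language code. A rough sketch under that assumption only:

import os

def split_name(filename):
    # assumed convention: '<local_number>_<lang>[...].txt', e.g. '001234_en.txt'
    parts = os.path.splitext(os.path.basename(filename))[0].split('_')
    local_number, lang = parts[0], parts[1]
    return local_number, lang
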
Example #5
def generate_textacy_corpus(data_folder,
                            wti_index,
                            container,
                            source_path,
                            language,
                            merge_entities,
                            overwrite=False,
                            period_group='years_1935-1972',
                            treaty_filter='',
                            parties=None,
                            disabled_pipes=None,
                            tick=utility.noop,
                            treaty_sources=None):

    for key in container.__dict__:
        container.__dict__[key] = None

    nlp_args = {'disable': disabled_pipes or []}

    container.source_path = source_path
    container.language = language
    container.textacy_corpus = None
    container.prepped_source_path = utility.path_add_suffix(
        source_path, '_preprocessed')

    if overwrite or not os.path.isfile(container.prepped_source_path):
        textacy_utility.preprocess_text(container.source_path,
                                        container.prepped_source_path,
                                        tick=tick)

    container.textacy_corpus_path = textacy_utility.generate_corpus_filename(
        container.prepped_source_path,
        container.language,
        nlp_args=nlp_args,
        compression=None,
        period_group=period_group)

    container.nlp = textacy_utility.setup_nlp_language_model(
        container.language, **nlp_args)

    if overwrite or not os.path.isfile(container.textacy_corpus_path):

        logger.info('Working: Computing new corpus ' +
                    container.textacy_corpus_path + '...')
        treaties = wti_index.get_treaties(language=container.language,
                                          period_group=period_group,
                                          treaty_filter=treaty_filter,
                                          parties=parties,
                                          treaty_sources=treaty_sources)
        reader = text_corpus.CompressedFileReader(
            container.prepped_source_path)

        stream = domain_logic.get_document_stream(reader,
                                                  container.language,
                                                  document_index=treaties)

        logger.info('Working: Stream created...')

        tick(0, len(treaties))
        container.textacy_corpus = textacy_utility.create_textacy_corpus(
            stream, container.nlp, tick)
        container.textacy_corpus.save(container.textacy_corpus_path)
        tick(0)

    else:
        logger.info('Working: Loading corpus ' +
                    container.textacy_corpus_path + '...')
        tick(1, 2)
        container.textacy_corpus = textacy.Corpus.load(
            container.nlp, container.textacy_corpus_path)
        logger.info('Loaded corpus with {} documents.'.format(
            len(container.textacy_corpus)))

        tick(0)

    if merge_entities:
        logger.info('Working: Merging named entities...')
        for doc in container.textacy_corpus:
            named_entities = textacy.extract.named_entities(doc)
            merge_spans(named_entities, doc.spacy_doc)
    else:
        logger.info('Named entities not merged')

    logger.info('Done!')
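
The container argument is simply an object whose attributes are cleared and then repopulated (source_path, nlp, textacy_corpus, ...), so any attribute bag will do. A hypothetical call, in which the paths, the wti_index object and the flag values are placeholders:

import types

container = types.SimpleNamespace()
generate_textacy_corpus(data_folder='./data',
                        wti_index=wti_index,                       # assumed WTI treaty index object
                        container=container,
                        source_path='./data/treaties_corpus.zip',  # placeholder archive
                        language='en',
                        merge_entities=False)
print(len(container.textacy_corpus))
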
Example #6
def generate_textacy_corpus(
        domain_logic,
        data_folder,
        container,
        document_index,  # data_frame or lambda corpus: corpus_index
        source_path,
        language,
        merge_entities,
        overwrite=False,
        binary_format=True,
        use_compression=True,
        disabled_pipes=None,
        tick=utility.noop):

    for key in container.__dict__:
        container.__dict__[key] = None

    nlp_args = {'disable': disabled_pipes or []}

    store_format = 'binary' if binary_format else 'pickle'
    store_extension = 'bin' if binary_format else 'pkl'
    store_compression = 'bz2' if use_compression else ''

    container.source_path = source_path
    container.language = language
    container.textacy_corpus = None
    container.prepped_source_path = utility.path_add_suffix(
        source_path, '_preprocessed')

    if not os.path.isfile(container.prepped_source_path):
        textacy_utility.preprocess_text(container.source_path,
                                        container.prepped_source_path,
                                        tick=tick)

    container.textacy_corpus_path = textacy_utility.generate_corpus_filename(
        container.prepped_source_path,
        container.language,
        nlp_args=nlp_args,
        extension=store_extension,
        compression=store_compression)

    container.nlp = textacy_utility.setup_nlp_language_model(
        container.language, **nlp_args)

    if overwrite or not os.path.isfile(container.textacy_corpus_path):

        logger.info('Working: Computing new corpus ' +
                    container.textacy_corpus_path + '...')

        #FIXME VARYING ASPECTS:
        reader = text_corpus.CompressedFileReader(
            container.prepped_source_path)

        stream = domain_logic.get_document_stream(
            reader, container.language, document_index=document_index)

        logger.info('Working: Stream created...')

        if False:  # disabled branch, kept for reference; the streamed variant below is used
            tick(0, len(reader.filenames))

            container.textacy_corpus = textacy_utility.create_textacy_corpus(
                stream, container.nlp, tick)

            logger.info('storing corpus (this might take some time)...')
            textacy_utility.save_corpus(container.textacy_corpus,
                                        container.textacy_corpus_path,
                                        format=store_format)

            tick(0)
        else:
            textacy_utility.create_textacy_corpus_streamed(
                stream,
                container.nlp,
                container.textacy_corpus_path,
                format='binary',
                tick=utility.noop)
            container.textacy_corpus = textacy_utility.load_corpus(
                container.textacy_corpus_path, container.nlp, format='binary')

    else:
        logger.info('Working: Loading corpus ' +
                    container.textacy_corpus_path + '...')
        tick(1, 2)

        logger.info('...reading corpus (this might take several minutes)...')
        container.textacy_corpus = textacy_utility.load_corpus(
            container.textacy_corpus_path, container.nlp, format=store_format)
        tick(0)

    if merge_entities:
        logger.info('Working: Merging named entities...')
        try:
            for doc in container.textacy_corpus:
                named_entities = textacy.extract.named_entities(doc)
                textacy.spacier.utils.merge_spans(named_entities,
                                                  doc.spacy_doc)
        except Exception as ex:
            logger.error(ex)
            logger.info('NER merge failed')
    else:
        logger.info('Named entities not merged')

    logger.info('Done!')
Example #7
def create_source_stream(source_path, lang, document_index=None):
    reader = text_corpus.CompressedFileReader(source_path)
    stream = domain_logic.get_document_stream(reader, lang, document_index=document_index)
    return stream
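
A hypothetical use of this helper, with the path and index as placeholders:

stream = create_source_stream('treaties_en.zip', 'en', document_index=treaty_index)
for item in stream:
    ...  # item layout depends on which domain_logic.get_document_stream variant is bound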