def compute_stanford_ner(source_file, service_url=STANFORD_CORE_NLP_URL, excludes=None):

    excludes = excludes or ('O',)

    assert os.path.isfile(source_file), 'File missing!'

    tagger = corenlp.CoreNLPParser(url=service_url, encoding='utf8', tagtype='ner')

    reader = text_corpus.CompressedFileReader(source_file)
    document_index = domain_logic.compile_documents_by_filename(reader.filenames)
    stream = domain_logic.get_document_stream(reader, 'en', document_index=document_index)

    i = 0
    ner_data = []
    for filename, text, metadata in stream:

        print(filename)

        document_id = document_index.loc[document_index.filename == filename, 'document_id'].values[0]

        ner = recognize_named_entities(tagger, document_id, text, excludes)
        ner_data.extend(ner)

        i += 1
        if i % 10 == 0:
            logger.info('Processed {} files...'.format(i))

    return ner_data

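# Usage sketch, assuming a Stanford CoreNLP server with the NER annotator is
# running at the given URL and that this module's dependencies (os, corenlp,
# text_corpus, domain_logic, logger, STANFORD_CORE_NLP_URL,
# recognize_named_entities) are in place; the archive path is hypothetical:
#
#     ner_data = compute_stanford_ner(
#         './data/treaty_corpus_en.zip',        # hypothetical source archive
#         service_url='http://localhost:9000',  # default CoreNLP server port
#         excludes=('O',))
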
def get_document_stream(source, lang, document_index=None, id_extractor=None):

    assert document_index is not None

    if 'document_id' not in document_index.columns:
        document_index['document_id'] = document_index.index

    id_extractor = id_extractor or (lambda filename: filename.split('_')[0])

    lang_pattern = re.compile(r"^(\w*)\_" + lang + r"([\_\-]corr)?\.txt$")

    item_filter = lambda x: lang_pattern.match(x)  # and id_extractor(x) in document_index.index

    if isinstance(source, str):
        print('Opening archive: {}'.format(source))
        reader = text_corpus.CompressedFileReader(source, pattern=lang_pattern, itemfilter=item_filter)
    else:
        reader = source

    id_map = {
        filename: id_extractor(filename)
        for filename in reader.filenames if item_filter(filename)
    }

    missing = set(document_index.index) - set(id_map.values())
    if len(missing) > 0:
        logger.warning('Treaties not found in archive: ' + ', '.join(list(missing)))

    columns = ['signed_year', 'party1', 'party2']
    df = document_index[columns]

    for filename, text in reader:

        document_id = id_map.get(filename, None)

        if document_id not in df.index:
            continue

        metadata = df.loc[document_id].to_dict()
        metadata['filename'] = filename
        metadata['document_id'] = document_id
        metadata['treaty_id'] = document_id

        yield filename, document_id, text, metadata

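# Filename convention assumed by the default id_extractor and lang_pattern
# above (example names are hypothetical): for lang='en', a file such as
# '555123_en.txt' or '555123_en_corr.txt' matches the pattern, and the
# document id is the leading token of the filename:
#
#     >>> bool(re.match(r"^(\w*)\_en([\_\-]corr)?\.txt$", '555123_en.txt'))
#     True
#     >>> '555123_en.txt'.split('_')[0]
#     '555123'
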
def get_document_stream(source, lang, **kwargs):

    if isinstance(source, str):
        # FIXME Use "smart_open" or "open_sesame" library instead
        reader = text_corpus.CompressedFileReader(source)
    else:
        reader = source

    lookup = compile_documents_by_filename(reader.filenames).set_index('filename')
    lookup['filename'] = lookup.index

    for filename, text in reader:
        metadata = lookup.loc[filename].to_dict()
        yield filename, text, metadata

def get_document_stream(source, lang, **kwargs):

    reader = text_corpus.CompressedFileReader(source) if isinstance(source, str) else source

    df_corpus_index = compile_unesco_corpus_index(reader)

    #reader.filenames = sorted(list(df_corpus_index[(df_corpus_index.year//10).isin([194, 195, 196, 197, 198])].filename.values))

    logger.info('Note! Filter is applied: Only first file for each year.')
    reader.filenames = list(
        df_corpus_index[df_corpus_index.filename.str.contains('en')]
            .groupby('year')['filename'].min().values)

    df_corpus_index = df_corpus_index.loc[df_corpus_index.filename.isin(reader.filenames)].sort_values('filename')

    assert len(reader.filenames) == len(df_corpus_index)

    n_words_threshold = kwargs.get('n_words_threshold', 10)
    n_bytes_trunc_size = kwargs.get('n_bytes_trunc_size', None)

    logger.info('INFO: Files having less than {} words will be skipped'.format(n_words_threshold))
    logger.info('INFO: Files greater than {} bytes will be truncated'.format(n_bytes_trunc_size))

    processed_count = 0
    empty_count = 0
    truncated_count = 0
    yielded_count = 0

    for filename, text in reader:

        processed_count += 1

        local_number, lang = split_name(filename)
        local_number = int(local_number)

        metadata = df_corpus_index.loc[local_number].to_dict()

        if metadata['n_words'] < n_words_threshold:
            #logger.info('WARNING: Skipping empty file {} '.format(filename))
            empty_count += 1
            continue

        if metadata['lang'] != 'en':
            logger.info('WARNING: Skipping file (unknown language) {} '.format(filename))
            continue

        if n_bytes_trunc_size is not None and metadata['n_bytes'] > n_bytes_trunc_size:
            text = text[:n_bytes_trunc_size]
            truncated_count += 1

        # i = 0
        # for segment in chunk_text(text, step=50000):
        #     basename = str(local_number).zfill(10) + '_' + str(i).zfill(3)
        #     yield filename, text[:100000], metadata
        #     i += 1

        yield filename, text, metadata

        yielded_count += 1

    logger.info(
        'Corpus read done: {} processed files, {} empty files, {} truncated files, {} files yielded'
        .format(processed_count, empty_count, truncated_count, yielded_count))

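# Usage sketch for the UNESCO reader above; the archive path is hypothetical
# and compile_unesco_corpus_index/split_name are assumed to exist in this
# module. The kwargs set the word-count skip threshold and the byte
# truncation size:
#
#     stream = get_document_stream('./data/unesco_corpus.zip', 'en',
#                                  n_words_threshold=100,
#                                  n_bytes_trunc_size=200000)
#     for filename, text, metadata in stream:
#         print(filename, metadata['year'], len(text))
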
def generate_textacy_corpus(data_folder,
                            wti_index,
                            container,
                            source_path,
                            language,
                            merge_entities,
                            overwrite=False,
                            period_group='years_1935-1972',
                            treaty_filter='',
                            parties=None,
                            disabled_pipes=None,
                            tick=utility.noop,
                            treaty_sources=None):

    for key in container.__dict__:
        container.__dict__[key] = None

    nlp_args = {'disable': disabled_pipes or []}

    container.source_path = source_path
    container.language = language
    container.textacy_corpus = None
    container.prepped_source_path = utility.path_add_suffix(source_path, '_preprocessed')

    if overwrite or not os.path.isfile(container.prepped_source_path):
        textacy_utility.preprocess_text(container.source_path, container.prepped_source_path, tick=tick)

    container.textacy_corpus_path = textacy_utility.generate_corpus_filename(
        container.prepped_source_path,
        container.language,
        nlp_args=nlp_args,
        compression=None,
        period_group=period_group)

    container.nlp = textacy_utility.setup_nlp_language_model(container.language, **nlp_args)

    if overwrite or not os.path.isfile(container.textacy_corpus_path):

        logger.info('Working: Computing new corpus ' + container.textacy_corpus_path + '...')

        treaties = wti_index.get_treaties(language=container.language,
                                          period_group=period_group,
                                          treaty_filter=treaty_filter,
                                          parties=parties,
                                          treaty_sources=treaty_sources)

        reader = text_corpus.CompressedFileReader(container.prepped_source_path)
        stream = domain_logic.get_document_stream(reader, container.language, document_index=treaties)

        logger.info('Working: Stream created...')

        tick(0, len(treaties))

        container.textacy_corpus = textacy_utility.create_textacy_corpus(stream, container.nlp, tick)
        container.textacy_corpus.save(container.textacy_corpus_path)

        tick(0)

    else:
        logger.info('Working: Loading corpus ' + container.textacy_corpus_path + '...')
        tick(1, 2)
        container.textacy_corpus = textacy.Corpus.load(container.nlp, container.textacy_corpus_path)
        logger.info('Loaded corpus with {} documents.'.format(len(container.textacy_corpus)))
        tick(0)

    if merge_entities:
        logger.info('Working: Merging named entities...')
        for doc in container.textacy_corpus:
            named_entities = textacy.extract.named_entities(doc)
            merge_spans(named_entities, doc.spacy_doc)
    else:
        logger.info('Named entities not merged')

    logger.info('Done!')

def generate_textacy_corpus(
        domain_logic,
        data_folder,
        container,
        document_index,  # data_frame or lambda corpus: corpus_index
        source_path,
        language,
        merge_entities,
        overwrite=False,
        binary_format=True,
        use_compression=True,
        disabled_pipes=None,
        tick=utility.noop):

    for key in container.__dict__:
        container.__dict__[key] = None

    nlp_args = {'disable': disabled_pipes or []}

    store_format = 'binary' if binary_format else 'pickle'
    store_extension = 'bin' if binary_format else 'pkl'
    store_compression = 'bz2' if use_compression else ''

    container.source_path = source_path
    container.language = language
    container.textacy_corpus = None
    container.prepped_source_path = utility.path_add_suffix(source_path, '_preprocessed')

    if not os.path.isfile(container.prepped_source_path):
        textacy_utility.preprocess_text(container.source_path, container.prepped_source_path, tick=tick)

    container.textacy_corpus_path = textacy_utility.generate_corpus_filename(
        container.prepped_source_path,
        container.language,
        nlp_args=nlp_args,
        extension=store_extension,
        compression=store_compression)

    container.nlp = textacy_utility.setup_nlp_language_model(container.language, **nlp_args)

    if overwrite or not os.path.isfile(container.textacy_corpus_path):

        logger.info('Working: Computing new corpus ' + container.textacy_corpus_path + '...')

        # FIXME VARYING ASPECTS:
        reader = text_corpus.CompressedFileReader(container.prepped_source_path)
        stream = domain_logic.get_document_stream(reader, container.language, document_index=document_index)

        logger.info('Working: Stream created...')

        if False:  # in-memory branch disabled; the streamed branch below is the active path
            tick(0, len(reader.filenames))
            container.textacy_corpus = textacy_utility.create_textacy_corpus(stream, container.nlp, tick)
            logger.info('storing corpus (this might take some time)...')
            textacy_utility.save_corpus(container.textacy_corpus, container.textacy_corpus_path, format=store_format)
            tick(0)
        else:
            textacy_utility.create_textacy_corpus_streamed(stream,
                                                           container.nlp,
                                                           container.textacy_corpus_path,
                                                           format='binary',
                                                           tick=utility.noop)
            container.textacy_corpus = textacy_utility.load_corpus(container.textacy_corpus_path, container.nlp, format='binary')

    else:
        logger.info('Working: Loading corpus ' + container.textacy_corpus_path + '...')
        tick(1, 2)
        logger.info('...reading corpus (this might take several minutes)...')
        container.textacy_corpus = textacy_utility.load_corpus(container.textacy_corpus_path, container.nlp, format=store_format)
        tick(0)

    if merge_entities:
        logger.info('Working: Merging named entities...')
        try:
            for doc in container.textacy_corpus:
                named_entities = textacy.extract.named_entities(doc)
                textacy.spacier.utils.merge_spans(named_entities, doc.spacy_doc)
        except Exception as ex:
            logger.error(ex)
            logger.info('NER merge failed')
    else:
        logger.info('Named entities not merged')

    logger.info('Done!')

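# Usage sketch, assuming the project modules (domain_logic, textacy_utility,
# utility, text_corpus) are importable and a document_index has been prepared;
# paths are placeholders. The container can be any mutable object exposing
# __dict__, e.g. types.SimpleNamespace:
#
#     container = types.SimpleNamespace()
#     generate_textacy_corpus(
#         domain_logic=domain_logic,
#         data_folder='./data',
#         container=container,
#         document_index=document_index,
#         source_path='./data/corpus_en.zip',
#         language='en',
#         merge_entities=False,
#         disabled_pipes=('parser',))
#     corpus = container.textacy_corpus
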
def create_source_stream(source_path, lang, document_index=None):

    reader = text_corpus.CompressedFileReader(source_path)
    stream = domain_logic.get_document_stream(reader, lang, document_index=document_index)

    return stream