def generate_html(event_texts, batch_size, n_process=-1, nlp=None, save=False):
    '''Pipe a list-like of strings into spaCy documents across multiple
    processes and return the rendered entity HTML as a list.

    arguments:
    -- event_texts : list of str (e.g. the event_text column from the events dataframe).
    -- batch_size  : number of strings each worker processes per batch.
    -- n_process   : number of worker processes on the CPU (default = -1 : all available cores).
    -- nlp         : pre-loaded spaCy language model. If None, load the default
                     configuration expected for this task.

    returns: list of HTML strings
    '''
    if nlp is None:
        from pipeline.preprocessing.text import load_spacy_model
        nlp = load_spacy_model(output_type='doc',
                               trigger_matcher=True,
                               lemmatizer=False,
                               geological_matcher=True,
                               stopword_removal=False,
                               punctuation_removal=False,
                               lemmatize_triggers=True,
                               verbose=False)

    # pipe the texts through the model and render each doc's entities as HTML
    html = [
        display_ent(doc, jupyter=False) for doc in tqdm(
            nlp.pipe(event_texts, batch_size=batch_size, n_process=n_process),
            desc='Rendering spaCy entities as HTML')
    ]
    return html
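# Example usage (a sketch; assumes an events dataframe with an 'event_text'
# column, e.g. one built by build_event_data below):
#
#   events = pd.read_csv('data/events/events_high-conf.csv', index_col=0)
#   html_strings = generate_html(events.event_text.tolist(), batch_size=50,
#                                n_process=2)
#   print(html_strings[0][:200])  # first 200 characters of the rendered markup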
def load_metadata_with_triggers(nlp=None, num_files=None, data_folder='data'):
    '''Load the report metadata and merge on the trigger matches for each file.'''
    nlp = nlp or load_spacy_model(output_type='text', tokenizer_only=True)
    metadata = load_metadata()
    filenames = metadata.filename.tolist()
    if num_files is not None:
        filenames = filenames[:num_files]
    file_triggers = match_triggers(filenames, nlp=nlp, save_json=False)
    data = file_triggers.merge(metadata, on='filename')
    return data
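# Example usage (a sketch; assumes load_metadata and match_triggers are
# available in this module and the data folder is populated):
#
#   data = load_metadata_with_triggers(num_files=100)
#   print(data.columns.tolist())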
def load_event_data(nlp=None, cols=None, confidence='high'):
    '''Load the labelled events for a given confidence level and attach a
    tokenized version of each event text.'''
    nlp = nlp or load_spacy_model(output_type='text',
                                  lemmatizer=True,
                                  geological_matcher=False,
                                  stopword_removal=False,
                                  punctuation_removal=True,
                                  verbose=False)
    cols = default_event_cols if cols is None else cols
    df = pd.read_csv(f'data/events/events_{confidence}-conf.csv',
                     index_col=0,
                     usecols=cols)
    df['tokens'] = list(nlp.pipe(df.event_text.values))
    return df
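# Example usage (a sketch; confidence selects between the files under
# data/events/, e.g. events_high-conf.csv or events_medium-conf.csv):
#
#   events = load_event_data(confidence='high')
#   print(events.head())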
def extract_text_chunks(filenames,
                        pad=2,
                        skip_on_trigger=False,
                        tokenize=False,
                        nlp=None,
                        n_process=-1,
                        batch_size=100):
    '''Build padded text chunks around every sentence in each file. Each chunk
    joins the sentence with `pad` sentences of context on either side.'''
    # if a dictionary is passed where keys are filenames and values are
    # pre-loaded files, we don't load from disk
    if isinstance(filenames, dict):
        files = filenames
    else:
        # load report files from disk to extract events on triggers
        files = load_files(filenames, data_path='data/wamex_xml', output='dict')

    if skip_on_trigger:
        # skipping over overlapping text chunks has not been implemented yet
        pass

    if tokenize:
        nlp = nlp or load_spacy_model(output_type='text',
                                      lemmatizer=True,
                                      geological_matcher=False,
                                      stopword_removal=False,
                                      punctuation_removal=True,
                                      lemmatize_triggers=True,
                                      verbose=False)
        files = {
            file: list(
                nlp.pipe(sentences, n_process=n_process,
                         batch_size=batch_size))
            for file, sentences in tqdm(files.items(),
                                        desc='Tokenizing file text')
        }

    # pad the sentence list with empty strings on both ends so a full window
    # of (2 * pad + 1) sentences can be taken around every index, then join
    # each window into a single chunk
    return {
        file: [
            ' '.join((pad * [''] + sentences +
                      pad * [''])[idx:(1 + idx + (2 * pad))]).strip()
            for idx in range(len(sentences))
        ]
        for file, sentences in tqdm(files.items(),
                                    desc='Extracting text chunks')
    }
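# Example usage (a sketch; the filename below is hypothetical and assumes the
# report files live under data/wamex_xml as load_files expects):
#
#   chunks = extract_text_chunks(['a12345_report.json'], pad=2)
#   for chunk in chunks['a12345_report.json'][:3]:
#       print(chunk)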
def write_html(df, batch_size=100, n_process=-1, nlp=None):
    '''Pipe the events dataframe into spaCy documents across multiple
    processes and save pre-rendered .html files in data/html/spacy/.'''
    import pandas as pd

    # load event data; if a filename is passed, load the file from disk
    if isinstance(df, str):
        df = pd.read_csv(df, index_col=0)
    assert isinstance(df, pd.DataFrame)

    if nlp is None:
        # load language model
        from pipeline.preprocessing.text import load_spacy_model
        nlp = load_spacy_model(output_type='doc',
                               trigger_matcher=True,
                               lemmatizer=False,
                               geological_matcher=True,
                               stopword_removal=False,
                               punctuation_removal=False,
                               lemmatize_triggers=True,
                               verbose=False)

    # pass the loaded model through to generate_html so it is not loaded twice
    event_html = {
        event_id: html
        for event_id, html in zip(
            df.event_id.values,
            generate_html(df.event_text.values,
                          batch_size=batch_size,
                          n_process=n_process,
                          nlp=nlp))
    }
    for event_id, html in tqdm(
            event_html.items(),
            desc='Saving pre-rendered .html to data/html/spacy/'):
        with open(os.path.join('data', 'html', 'spacy', f'{event_id}.html'),
                  'w+') as f:
            f.write(html)
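# Example usage (a sketch; assumes the data/html/spacy/ directory exists and
# the CSV holds event_id and event_text columns):
#
#   write_html('data/events/events_high-conf.csv', batch_size=50, n_process=2)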
def build_event_data(datasets: dict,
                     confidence,
                     pad=0,
                     batch_size=100,
                     n_process=6,
                     nlp=None,
                     labelled_ranges=True,
                     group_all_labelled=False,
                     named_entities=None,
                     return_entities=True,
                     geoview=None,
                     capstone_files=None,
                     files=None):
    '''Merge the labelled datasets into a single events dataframe, build the
    padded event text for each label, and (optionally) attach the named
    entities found in each event text.'''
    # load files if they are not provided
    if not isinstance(capstone_files, pd.DataFrame) or files is None:
        capstone_files, files = get_report_data(count_sentences=False,
                                                return_files=True)

    # merge datasets provided by individual labellers
    df = merge_datasets(datasets, confidence=confidence)

    # apply the build event text function to build a text chunk from the
    # labelled sentences (nb: the chunk is built with a fixed pad of 2
    # sentences here, independent of the pad argument)
    df.insert(
        6, 'event_text',
        df.apply(lambda row: build_event_text(
            row, pad=2, labelled_ranges=labelled_ranges, files=files),
                 axis=1))

    # insert the event_id natural key which is f'{filename}_{idx}'
    df.insert(
        0, 'event_id',
        df.apply(lambda row: '_'.join(
            [row.filename.rsplit('.', 1)[0], str(row.sentence_idx)]),
                 axis=1))

    # return the text chunk start and end positions, from the labelled
    # lower_bound/upper_bound if available, otherwise from the fixed pad
    if labelled_ranges:
        df['lower_idx'] = df['sentence_idx'] + df['lower_bound']
        df['upper_idx'] = df['sentence_idx'] + df['upper_bound']
    else:
        df['lower_idx'] = df['sentence_idx'] - pad
        df['upper_idx'] = df['sentence_idx'] + pad
    df.rename(columns={'triggers': 'sentence_triggers'}, inplace=True)
    df.drop(columns=['lower_bound', 'upper_bound'], inplace=True)

    # load the old event labels from the group labelling early in the
    # semester; only included when the confidence is not 'high'
    if group_all_labelled and confidence.lower() != 'high':
        old_events = load_group_all_labelled(geoview=geoview,
                                             capstone_files=capstone_files,
                                             files=files)
        final_index = df.index[-1]
        old_events.index = np.arange(final_index,
                                     len(old_events) + final_index)
        df = pd.concat([df, old_events])

    # run named entity recognition with spaCy on each text chunk
    if return_entities:
        nlp = nlp or load_spacy_model(output_type='doc',
                                      trigger_matcher=True,
                                      lemmatizer=False,
                                      geological_matcher=True,
                                      stopword_removal=False,
                                      punctuation_removal=False,
                                      lemmatize_triggers=True)
        named_entities = named_entities or [
            'DATE', 'LOCATION', 'TRIGGER', 'STRAT', 'ROCK', 'MINERAL',
            'ORE_DEPOSIT', 'TIMESCALE'
        ]
        # create a (event_id, entity text, entity label) tuple for each
        # matching entity in each event
        event_entities = [
            (event_id, ent.text, ent.label_) for event_id, doc in tqdm(
                zip(
                    df.event_id.values,
                    nlp.pipe(df.event_text.values,
                             batch_size=batch_size,
                             n_process=n_process)),
                desc=f'Extracting {confidence} confidence events')
            for ent in doc.ents if ent.label_ in named_entities
        ]
        # join the entities together as one string per label, one column per
        # label, then merge onto the original dataframe
        df = df.merge(pd.DataFrame(
            data=event_entities,
            columns=['event_id', 'entity', 'label']).groupby([
                'event_id', 'label'
            ]).apply(lambda x: ', '.join(x.entity)).unstack(level='label'),
                      on='event_id',
                      how='left').fillna('')

    assert all(
        files[event.filename][event.sentence_idx] in event.event_text
        for event in df.itertuples()
    ), f'sentences not matched in {confidence} confidence event text'
    return df
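# Example usage (a sketch; the labeller names and dataframes are hypothetical,
# and merge_datasets is assumed to accept a {labeller: dataframe} mapping):
#
#   events = build_event_data(datasets={'labeller_a': df_a,
#                                       'labeller_b': df_b},
#                             confidence='high', n_process=2)
#   events.to_csv('data/events/events_high-conf.csv')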
group_all_labelled = args.events
print('Building training data from labelled near miss instances by: '
      f'{" ".join(users)}')
print('Confidence thresholds to label text chunks as near miss: '
      f'{" ".join(confs)}')
if group_all_labelled:
    print('Including old labelled training data.')

# load spacy model
nlp = load_spacy_model(output_type='doc',
                       trigger_matcher=True,
                       lemmatizer=False,
                       geological_matcher=True,
                       stopword_removal=False,
                       punctuation_removal=False,
                       lemmatize_triggers=True,
                       verbose=False)

# load files and geoview metadata
capstone_files, files = get_report_data(count_sentences=True,
                                        return_files=True)
metadata = pd.read_csv('data/geoview/capstone_metadata.zip',
                       compression='zip',
                       parse_dates=['report_year'],
                       usecols=[
                           'anumber', 'title', 'report_type', 'project',
                           'keywords', 'commodity', 'report_year'
                       ])