def buildGraph(neighbor_files, k):
    log.writeln('Building neighborhood graph...')

    graph = {}

    # construct frequency-weighted edges
    log.track(message=' >> Loaded {0}/%d neighborhood files' % len(neighbor_files), writeInterval=1)
    for neighbor_file in neighbor_files:
        neighborhoods = readNeighbors(neighbor_file, k)
        for (source, neighbors) in neighborhoods.items():
            if graph.get(source, None) is None:
                graph[source] = {}
            for nbr in neighbors:
                graph[source][nbr] = graph[source].get(nbr, 0) + 1
        log.tick()
    log.flushTracker()

    log.writeln(' >> Normalizing edge weights...')
    max_count = float(len(neighbor_files))
    for (source, neighborhood) in graph.items():
        for (nbr, freq) in neighborhood.items():
            graph[source][nbr] = freq / max_count

    log.writeln('Graph complete!')
    return graph
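# Illustrative sketch (not part of the original module): how the frequency
# weighting above behaves on toy data. The dicts passed in stand in for the
# output of readNeighbors(); all names here are hypothetical.
def _toy_build_graph(neighborhood_sets):
    graph = {}
    for neighborhoods in neighborhood_sets:
        for (source, neighbors) in neighborhoods.items():
            graph.setdefault(source, {})
            for nbr in neighbors:
                graph[source][nbr] = graph[source].get(nbr, 0) + 1
    # normalize each edge by the number of input neighborhood files,
    # so a weight of 1.0 means the edge appeared in every file
    max_count = float(len(neighborhood_sets))
    for source, neighborhood in graph.items():
        for nbr in neighborhood:
            neighborhood[nbr] /= max_count
    return graph

# _toy_build_graph([{'a': ['b', 'c']}, {'a': ['b']}])
#   -> {'a': {'b': 1.0, 'c': 0.5}}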
def collapseMentionEmbeddings(f, keys, layer, mentions_by_id, action_oracle):
    log.track(message=' >> Processed {0}/{1:,} mentions'.format(
        '{0:,}', len(keys)), writeInterval=50)

    new_mentions = []
    with h5py.File(f, 'r') as stream:
        for i in range(len(keys)):
            (m_id, mention_start, mention_end) = keys[i]
            mention_token_embeddings = stream[str(i)][...]

            if layer == AVERAGE_LAYERS:
                mention_token_embeddings = np.mean(mention_token_embeddings, axis=0)
            else:
                mention_token_embeddings = mention_token_embeddings[layer, :, :]

            if action_oracle:
                mention_token_embeddings = mention_token_embeddings[mention_start:mention_end]

            mention_embedding = np.mean(mention_token_embeddings, axis=0)

            old_mention = mentions_by_id[m_id]
            new_mentions.append(mention_file.EmbeddedMention(
                CUI=old_mention.CUI,
                mention_repr=None,
                context_repr=mention_embedding,
                candidates=old_mention.candidates,
                ID=old_mention.ID
            ))
            log.tick()
    log.flushTracker()

    return new_mentions
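# Illustrative sketch (assumed shapes, not part of the original module): each
# HDF5 dataset read above is a [layer, token, dim] tensor; a mention embedding
# is the mean over the mention's tokens after either averaging all layers or
# selecting a single one.
import numpy as np

def _toy_collapse(token_embeddings, layer, start, end, average_layers=False):
    # token_embeddings: np.ndarray of shape (n_layers, n_tokens, dim)
    if average_layers:
        token_embeddings = np.mean(token_embeddings, axis=0)   # (n_tokens, dim)
    else:
        token_embeddings = token_embeddings[layer, :, :]       # (n_tokens, dim)
    token_embeddings = token_embeddings[start:end]             # mention span only
    return np.mean(token_embeddings, axis=0)                   # (dim,)

# _toy_collapse(np.zeros((4, 10, 768)), layer=-1, start=2, end=5).shape == (768,)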
def streamingBERTConvert(bertf, overlaps, outf, tokenizedf):
    line_index = 0
    overlap_index = 0

    log.track(' >> Processed {0:,} lines of BERT output ({1:,}/%s text lines)' % (
        '{0:,}'.format(len(overlaps))
    ))

    line_embeddings_by_layer = {}

    with open(bertf, 'r') as bert_stream, \
         h5py.File(outf, 'w') as h5stream, \
         open(tokenizedf, 'w') as token_stream:
        for line in bert_stream:
            data = json.loads(line)
            all_tokens = data['features']

            # blank lines in the input still get entered in JSON;
            # skip output from those lines to maintain proper alignment
            if len(all_tokens) > 0:
                for i in range(len(all_tokens)):
                    token_stream.write(all_tokens[i]['token'])
                    if i < len(all_tokens) - 1:
                        token_stream.write(' ')
                    else:
                        token_stream.write('\n')

                overlap_quantity = overlaps[line_index][overlap_index]

                for token_embedding in all_tokens[:len(all_tokens) - overlap_quantity]:
                    for embedding_layer in token_embedding['layers']:
                        layer_ix = embedding_layer['index']
                        if layer_ix not in line_embeddings_by_layer:
                            line_embeddings_by_layer[layer_ix] = []
                        line_embeddings_by_layer[layer_ix].append(embedding_layer['values'])

                if overlap_quantity == 0:
                    # hit end of line, so construct numpy tensor as
                    #   [ <layer>, <token_ix>, <values> ]
                    line_tensor = np.array([
                        layer_token_values
                        for (layer_ix, layer_token_values) in sorted(
                            line_embeddings_by_layer.items(),
                            key=lambda pair: pair[0]
                        )
                    ])
                    h5stream.create_dataset(
                        str(line_index),
                        data=line_tensor
                    )
                    line_index += 1
                    overlap_index = 0
                    line_embeddings_by_layer = {}
                else:
                    overlap_index += 1

            log.tick(line_index + 1)
    log.flushTracker(line_index)
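# Illustrative sketch (not part of the original module): how the per-line
# tensor written above is assembled from extract_features-style JSON records,
# one list of values per (layer, token), ordered by layer index.
import numpy as np

def _toy_line_tensor(all_tokens):
    by_layer = {}
    for token in all_tokens:
        for layer in token['layers']:
            by_layer.setdefault(layer['index'], []).append(layer['values'])
    # sorted by layer index -> shape (n_layers, n_tokens, dim)
    return np.array([vals for (_ix, vals) in sorted(by_layer.items())])

# _toy_line_tensor([
#     {'token': 'hi', 'layers': [{'index': -1, 'values': [0.1, 0.2]},
#                                {'index': -2, 'values': [0.3, 0.4]}]},
# ]).shape == (2, 1, 2)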
def _nn_writer(neighborf, node_IDs, nn_q):
    stream = open(neighborf, 'w')
    stream.write('# File format is:\n#   <word vocab index>,<NN 1>,<NN 2>,...\n')

    result = nn_q.get()
    log.track(message=' >> Processed {0}/{1:,} samples'.format('{0:,}', len(node_IDs)), writeInterval=50)
    while result != _SIGNALS.HALT:
        (ix, neighbors) = result
        stream.write('%s\n' % ','.join([
            str(d) for d in [
                node_IDs[ix],
                *[node_IDs[nbr] for nbr in neighbors]
            ]
        ]))
        log.tick()
        result = nn_q.get()
    log.flushTracker()
    stream.close()
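# Illustrative sketch (not part of the original module): the writer above is the
# consumer half of a producer/consumer setup; worker processes put
# (index, neighbors) tuples on the queue and a sentinel signals shutdown.
# _SIGNALS.HALT is the project's sentinel; a plain string stands in here.
_TOY_HALT = 'HALT'

def _toy_writer(nn_q, path):
    with open(path, 'w') as stream:
        item = nn_q.get()
        while item != _TOY_HALT:
            ix, neighbors = item
            stream.write(','.join(str(x) for x in [ix, *neighbors]) + '\n')
            item = nn_q.get()

# usage (assumed): run _toy_writer in its own multiprocessing.Process, have
# workers call nn_q.put((ix, neighbors)), then put _TOY_HALT after joining them.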
def _writeAnnotations(outfile, annot_q):
    '''(Multithreaded) Writes annotations to a file, one per line.

    Annotation format is:
      <offset> <# of tokens> <term ID>
    where offset is the number of tokens from the start of the immediately
    preceding annotation. (Reduces file size)
    '''
    previous_start_ix = 0
    with codecs.open(outfile, 'w', 'utf-8') as stream:
        write_queue, write_ix, halting = {}, 0, False
        log.track(message=' >> Processed {0:,} lines', writeInterval=100)
        while True:
            packet = annot_q.get()
            if packet == _SIGNALS.HALT:
                halting = True

            if not halting:
                # store the next tagged line in the priority queue, indexed by position
                (line_ix, annotations) = packet
                write_queue[line_ix] = annotations

            # check if the next position to write has been queued yet
            while write_queue.get(write_ix, None) is not None:
                annotations = write_queue.pop(write_ix)
                for (start_ix, num_tokens, term_id) in annotations:
                    offset = start_ix - previous_start_ix
                    stream.write('%d %d %s\n' % (offset, num_tokens, term_id))
                    previous_start_ix = start_ix
                write_ix += 1
                log.tick()

            # make sure that we've cleared out all of the write queue
            if halting:
                if len(write_queue) > 0:
                    print(write_queue)
                    raise Exception(
                        'Line missing: ordered write queue is not empty!')
                break
    log.flushTracker()
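# Illustrative sketch (not part of the original module): the offset field in the
# annotation format above is a delta from the previous annotation's start token,
# so absolute positions are recovered by a running sum.
def _toy_encode_offsets(start_indices):
    previous, offsets = 0, []
    for start in start_indices:
        offsets.append(start - previous)
        previous = start
    return offsets

# _toy_encode_offsets([3, 10, 12]) -> [3, 7, 2]
# decoding: a cumulative sum of the offsets gives back [3, 10, 12]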
def filterMentions(preprocessed, options):
    filtered_mentions, skipped = [], 0

    log.track(message=' >> Processed {0:,}/%s mentions' % '{0:,}'.format(len(preprocessed.mentions)), writeInterval=100)
    for m in preprocessed.mentions:
        valid = True
        (feature_vector, label) = prepSample(m, preprocessed,
            preprocessed.global_unigram_features, options)

        # check to ensure we have any features for this point
        if feature_vector is None:
            valid = False
            skipped += 1

        if valid:
            filtered_mentions.append(m)
        log.tick()
    log.flushTracker()

    return filtered_mentions, skipped
def readTerminology(terminology_f, tokenizer, remove_stopwords=False, use_collapsed_string=False):
    if remove_stopwords:
        stopwords = set(nltk.corpus.stopwords.words('english'))
    else:
        stopwords = set()

    hook = codecs.open(terminology_f, 'r', 'utf-8')

    # initialize tag->[cuis] and ngram maps
    entities_by_term = {}
    mapper = NGramMapper()

    hook.readline()  # ignore header line

    log.track(message=' >> Processed {0:,} lines', writeInterval=1000)
    for line in hook:
        (entity_ID, term) = line.split('\t')
        term = tokenizer.tokenize(term)
        entity_ID = entity_ID.strip()

        # if the string is in the set of stopwords to ignore, ignore it
        if ' '.join(term) in stopwords:
            continue

        # add string to ngram map
        term_ID = mapper.add(term, use_collapsed_string=use_collapsed_string)

        # add CUI to this tag ID
        entities = entities_by_term.get(term_ID, [])
        entities.append(entity_ID)
        entities_by_term[term_ID] = list(set(entities))  # remove duplicates

        log.tick()
    log.flushTracker()

    hook.close()
    return mapper.ngrams, entities_by_term
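# Illustrative sketch (not part of the original module): the terminology file
# read above is a two-column TSV of <entity ID>\t<term string>; the result maps
# each tokenized term to its (deduplicated) list of entity IDs.
def _toy_entities_by_term(tsv_lines):
    entities_by_term = {}
    for line in tsv_lines:
        entity_id, term = line.rstrip('\n').split('\t')
        key = tuple(term.lower().split())   # stands in for NGramMapper.add()
        entities = set(entities_by_term.get(key, []))
        entities.add(entity_id)
        entities_by_term[key] = sorted(entities)
    return entities_by_term

# _toy_entities_by_term(['C001\tleft knee', 'C002\tleft knee', 'C001\tleft knee'])
#   -> {('left', 'knee'): ['C001', 'C002']}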
def validate(corpus_f, annot_f, term_map):
    log.track(' >> Validated {0:,} tokens ({1:,} annotations)', writeInterval=100000)

    annots_so_far = 0

    with open(corpus_f, 'r') as corpus_stream, \
         open(annot_f, 'r') as annot_stream:
        annot_buffer = AnnotationBuffer(annot_stream, term_map)
        annot_buffer.refill()

        token_buffer = TokenBuffer(corpus_stream)

        since_last_term = -1

        while token_buffer.active_index < len(token_buffer.contents):
            t = token_buffer.contents[token_buffer.active_index]

            if len(annot_buffer) == 0:
                break

            # start watching any new terms beginning with this token
            since_last_term += 1
            while (
                (annot_buffer.active_up_to < len(annot_buffer))
                and (since_last_term == annot_buffer[annot_buffer.active_up_to].offset)
            ):
                annot_buffer.active_up_to += 1
                since_last_term = 0

            # for all watched terms, validate that the current word is as expected
            for i in range(annot_buffer.active_up_to):
                annot = annot_buffer[i]
                if annot.num_tokens != len(term_map[annot.term_id]):
                    raise KeyError(
                        'Expected term {0} to have {1:,} tokens, annotation has {2:,}'.format(
                            annot.term_id, len(term_map[annot.term_id]), annot.num_tokens))
                expected_token = term_map[annot.term_id][annot.seen_so_far]
                if expected_token != t:
                    raise ValueError(
                        'Expected token "{0}" for term {1} at position {2}, found "{3}"'.format(
                            expected_token, annot.term_id, annot.seen_so_far, t))
                annot.seen_so_far += 1

            # go back through the watched terms and unwatch any that have been completed
            i = 0
            while i < annot_buffer.active_up_to:
                if annot_buffer[i].seen_so_far == annot_buffer[i].num_tokens:
                    annot_buffer.pop(i)
                    annot_buffer.active_up_to -= 1
                    annots_so_far += 1
                else:
                    i += 1

            annot_buffer.refill()
            token_buffer.shift()
            log.tick(annots_so_far)

    log.flushTracker(annots_so_far)
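# Illustrative sketch (not part of the original module): a simplified,
# non-streaming version of the alignment check above. Annotations are
# (offset, num_tokens, term_id) triples in the delta-offset format written by
# _writeAnnotations; term_map maps term IDs to their expected token sequences.
def _toy_validate(tokens, annotations, term_map):
    start = 0
    for (offset, num_tokens, term_id) in annotations:
        start += offset
        span = tokens[start:start + num_tokens]
        if span != term_map[term_id]:
            raise ValueError('Mismatch for term %s: expected %s, found %s'
                             % (term_id, term_map[term_id], span))

# _toy_validate(['the', 'left', 'knee', 'hurts'],
#               [(1, 2, 'T1')],
#               {'T1': ['left', 'knee']})   # passes silently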
def extractAllEntities(data_directories, log=log, with_full_text=False,
                       errors='strict', by_document=False, polarity_type=int):
    '''Extract all Mobility, Action, Assistance, and Quantification entities
    from XML-formatted annotation files.

    @parameters
      data_directories :: list of directories containing .xml annotation files
      with_full_text   :: includes full document text in "full_text" field of each object
      log              :: logging object to write to (defaults to dng_logger.log)

    @returns
      mobilities      :: list of Mobility objects
      actions         :: list of Action objects
      assistances     :: list of Assistance objects
      quantifications :: list of Quantification objects
      (if by_document is True, returns a list of Document objects instead)
    '''
    mobilities = []
    actions = []
    assistances = []
    quantifications = []

    documents = []

    extractor = XMLEntityExtractor()

    for dir_path in data_directories:
        files = os.listdir(dir_path)
        log.writeln('Extracting data from %s...' % dir_path)
        log.track(
            message=' >> Extracted entities from {0:,}/{1:,} files ({2:,} entities)',
            writeInterval=1)

        for f in files:
            fpath = os.path.join(dir_path, f)
            doc = extractor.extractMentions(
                fpath,
                with_full_text=with_full_text,
                errors=errors,
                polarity_type=polarity_type,
                as_document=True)
            doc.file_path = fpath
            doc.ID = f

            for m in doc.mobilities:
                m.file_ID = f
                mobilities.append(m)
            for m in doc.actions:
                m.file_ID = f
                actions.append(m)
            for m in doc.assistances:
                m.file_ID = f
                assistances.append(m)
            for m in doc.quantifications:
                m.file_ID = f
                quantifications.append(m)

            documents.append(doc)

            log.tick(
                len(files),
                len(mobilities) + len(actions) + len(assistances) + len(quantifications))
        log.flushTracker(
            len(files),
            len(mobilities) + len(actions) + len(assistances) + len(quantifications))

    if by_document:
        return documents
    else:
        return (mobilities, actions, assistances, quantifications)
def extractAllEntities(data_directories, text_directory, csv_id_pattern,
                       txt_sub_pattern, log=log, with_full_text=False,
                       by_document=True):
    '''Extract all Mobility, Action, Assistance, and Quantification entities
    from CSV-formatted annotation files.

    @parameters
      data_directories :: list of directories containing .csv annotation files
      text_directory   :: directory containing reference .txt files
      csv_id_pattern   :: Python regex pattern for extracting file ID from a CSV
                          file name (as first group); e.g. 'myfile_([0-9]*).csv'
                          will extract '12345' as file ID from file myfile_12345.csv
      txt_sub_pattern  :: Python string formatting pattern for matching to
                          reference text files (ID substituted for {0}); e.g.,
                          'mytext_{0}.txt' will look for mytext_12345.txt for
                          file ID 12345
      log              :: logging object to write to (defaults to dng_logger.log)

    @returns
      mobilities      :: list of Mobility objects
      actions         :: list of Action objects
      assistances     :: list of Assistance objects
      quantifications :: list of Quantification objects
      (if by_document is True, the default, returns a list of Document objects instead)
    '''
    documents = []
    mobilities = []
    actions = []
    assistances = []
    quantifications = []

    paired_files = matchAnnotationAndTextFiles(data_directories, text_directory,
        csv_id_pattern, txt_sub_pattern, log=log)

    log.track(message=' >> Extracted entities from {0:,}/{1:,} files ({2:,} entities)',
        writeInterval=1)

    for (_id, (csvf, txtf)) in paired_files.items():
        doc = extractAnnotationsFromFile(
            csvf,
            txtf,
            as_document=True
        )
        doc.file_path = txtf
        doc.ID = _id

        for m in doc.mobilities:
            m.file_ID = _id
            mobilities.append(m)
        for m in doc.actions:
            m.file_ID = _id
            actions.append(m)
        for m in doc.assistances:
            m.file_ID = _id
            assistances.append(m)
        for m in doc.quantifications:
            m.file_ID = _id
            quantifications.append(m)

        documents.append(doc)

        log.tick(len(paired_files),
            len(mobilities) + len(actions) + len(assistances) + len(quantifications))
    log.flushTracker(len(paired_files),
        len(mobilities) + len(actions) + len(assistances) + len(quantifications))

    if by_document:
        return documents
    else:
        return (mobilities, actions, assistances, quantifications)
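# Illustrative sketch (not part of the original module): how csv_id_pattern and
# txt_sub_pattern are expected to pair annotation and text files, using the
# example patterns from the docstring above.
import re

def _toy_pair_files(csv_name, csv_id_pattern=r'myfile_([0-9]*).csv',
                    txt_sub_pattern='mytext_{0}.txt'):
    match = re.match(csv_id_pattern, csv_name)
    file_id = match.group(1)
    return file_id, txt_sub_pattern.format(file_id)

# _toy_pair_files('myfile_12345.csv') -> ('12345', 'mytext_12345.txt')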