def buildGraph(neighbor_files, k):
    log.writeln('Building neighborhood graph...')
    graph = {}

    # construct frequency-weighted edges
    # {0} is left for the progress tracker to fill in at each tick; the %d is
    # substituted now with the total number of neighborhood files
    log.track(message='  >> Loaded {0}/%d neighborhood files' %
              len(neighbor_files),
              writeInterval=1)
    for neighbor_file in neighbor_files:
        neighborhoods = readNeighbors(neighbor_file, k)
        for (source, neighbors) in neighborhoods.items():
            if source not in graph:
                graph[source] = {}
            for nbr in neighbors:
                graph[source][nbr] = graph[source].get(nbr, 0) + 1
        log.tick()
    log.flushTracker()

    log.writeln('  >> Normalizing edge weights...')
    max_count = float(len(neighbor_files))
    for (source, neighborhood) in graph.items():
        for (nbr, freq) in neighborhood.items():
            graph[source][nbr] = freq / max_count

    log.writeln('Graph complete!')
    return graph
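# A minimal sketch (not from the original source) of the structure buildGraph
# returns: a dict of dicts whose edge weights are the fraction of the input
# neighborhood files in which each (source, neighbor) pair appeared, so every
# weight lies in (0, 1]. Node IDs and weights below are made up for illustration.
example_graph = {42: {7: 1.0, 13: 0.5}, 7: {42: 0.5}}
strongest_first = sorted(example_graph[42].items(), key=lambda kv: -kv[1])
print(strongest_first)  # [(7, 1.0), (13, 0.5)]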
def collapseMentionEmbeddings(f, keys, layer, mentions_by_id, action_oracle):
    log.track(message='  >> Processed {0}/{1:,} mentions'.format(
        '{0:,}', len(keys)),
              writeInterval=50)
    new_mentions = []

    with h5py.File(f, 'r') as stream:
        for i in range(len(keys)):
            (m_id, mention_start, mention_end) = keys[i]
            mention_token_embeddings = stream[str(i)][...]

            if layer == AVERAGE_LAYERS:
                mention_token_embeddings = np.mean(mention_token_embeddings,
                                                   axis=0)
            else:
                mention_token_embeddings = mention_token_embeddings[
                    layer, :, :]

            if action_oracle:
                mention_token_embeddings = mention_token_embeddings[
                    mention_start:mention_end]
            mention_embedding = np.mean(mention_token_embeddings, axis=0)

            old_mention = mentions_by_id[m_id]
            new_mentions.append(
                mention_file.EmbeddedMention(CUI=old_mention.CUI,
                                             mention_repr=None,
                                             context_repr=mention_embedding,
                                             candidates=old_mention.candidates,
                                             ID=old_mention.ID))

            log.tick()
    log.flushTracker()

    return new_mentions
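# A small sketch (an assumption inferred from the indexing above, not original
# source) of the HDF5 layout collapseMentionEmbeddings expects: one dataset per
# mention, keyed by its position in `keys` as a string, with shape
# [num_layers, num_tokens, embedding_dim]. File name and sizes are made up.
import h5py
import numpy as np

with h5py.File('mention_embeddings.sample.hdf5', 'w') as stream:
    for i in range(3):                          # three fake mentions
        fake = np.random.rand(4, 5 + i, 768)    # 4 layers, 768-d vectors
        stream.create_dataset(str(i), data=fake)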
def streamingBERTConvert(bertf, overlaps, outf, tokenizedf):
    line_index = 0
    overlap_index = 0

    # {0:,} and {1:,} are filled in later by the progress tracker; the %s is
    # substituted now with the (comma-formatted) total number of text lines
    log.track('  >> Processed {0:,} lines of BERT output ({1:,}/%s text lines)' % (
        '{0:,}'.format(len(overlaps))
    ))
    line_embeddings_by_layer = {}
    with open(bertf, 'r') as bert_stream, \
         h5py.File(outf, 'w') as h5stream, \
         open(tokenizedf, 'w') as token_stream:
        for line in bert_stream:
            data = json.loads(line)
            all_tokens = data['features']
            # blank lines in the input still get entered in JSON;
            # skip output from those lines to maintain proper alignment
            if len(all_tokens) > 0:
                for i in range(len(all_tokens)):
                    token_stream.write(all_tokens[i]['token'])
                    if i < len(all_tokens)-1:
                        token_stream.write(' ')
                    else:
                        token_stream.write('\n')

                # overlaps[line_index][overlap_index] gives how many trailing
                # tokens of this BERT chunk are shared with the next chunk of
                # the same text line (0 means the line is complete); trim them
                # so each token's embedding is stored only once
                overlap_quantity = overlaps[line_index][overlap_index]
                for token_embedding in all_tokens[:len(all_tokens)-overlap_quantity]:
                    for embedding_layer in token_embedding['layers']:
                        layer_ix = embedding_layer['index']
                        if layer_ix not in line_embeddings_by_layer:
                            line_embeddings_by_layer[layer_ix] = []
                        line_embeddings_by_layer[layer_ix].append(embedding_layer['values'])

                if overlap_quantity == 0:
                    # hit end of line, so construct numpy tensor as
                    # [ <layer>, <token_ix>, <values> ]
                    line_tensor = np.array([
                        layer_token_values
                            for (layer_ix, layer_token_values)
                            in sorted(
                                line_embeddings_by_layer.items(),
                                key=lambda pair: pair[0]
                            )
                    ])
                    h5stream.create_dataset(
                        str(line_index),
                        data=line_tensor
                    )
                    line_index += 1
                    overlap_index = 0
                    line_embeddings_by_layer = {}

                else:
                    overlap_index += 1

            log.tick(line_index+1)
    log.flushTracker(line_index)
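# A minimal read-back sketch (the file name is a placeholder, not from the
# original source): streamingBERTConvert stores one HDF5 dataset per text line,
# keyed by the 0-based line index, with shape
# [num_layers, num_tokens, embedding_dim].
import h5py

with h5py.File('bert_embeddings.hdf5', 'r') as h5stream:
    for line_key in sorted(h5stream.keys(), key=int):
        line_tensor = h5stream[line_key][...]
        print(line_key, line_tensor.shape)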
def _nn_writer(neighborf, node_IDs, nn_q):
    stream = open(neighborf, 'w')
    stream.write('# File format is:\n# <word vocab index>,<NN 1>,<NN 2>,...\n')
    result = nn_q.get()
    log.track(message='  >> Processed {0}/{1:,} samples'.format('{0:,}', len(node_IDs)), writeInterval=50)
    while result != _SIGNALS.HALT:
        (ix, neighbors) = result
        stream.write('%s\n' % ','.join(
            str(node_IDs[n]) for n in (ix, *neighbors)
        ))
        log.tick()
        result = nn_q.get()
    stream.close()
    log.flushTracker()
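# A companion reader sketch (hypothetical helper, not part of the original
# source) for the file _nn_writer produces: comment lines start with '#', and
# every other line is <word vocab index>,<NN 1>,<NN 2>,... where each value is
# a node ID written with str().
def readNearestNeighbors(neighborf):
    neighbors_by_node = {}
    with open(neighborf, 'r') as stream:
        for line in stream:
            if line.startswith('#') or not line.strip():
                continue
            # cast these to int if your node IDs are numeric vocabulary indices
            values = line.strip().split(',')
            neighbors_by_node[values[0]] = values[1:]
    return neighbors_by_node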
def _writeAnnotations(outfile, annot_q):
    '''(Multithreaded) Writes annotations to a file, one per line.

    Annotation format is:
       <offset> <# of tokens> <term ID>

    Where offset is the number of tokens from the start of the immediately
    preceding annotation.  (Reduces file size)
    '''
    previous_start_ix = 0
    with codecs.open(outfile, 'w', 'utf-8') as stream:
        write_queue, write_ix, halting = {}, 0, False
        log.track(message='  >> Processed {0:,} lines', writeInterval=100)
        while True:
            packet = annot_q.get()
            if packet == _SIGNALS.HALT:
                halting = True

            if not halting:
                # buffer the tagged line, keyed by its position in the output order
                (line_ix, annotations) = packet
                write_queue[line_ix] = annotations

            # check if the next position to write has been queued yet
            while write_queue.get(write_ix, None) is not None:
                annotations = write_queue.pop(write_ix)
                for (start_ix, num_tokens, term_id) in annotations:
                    offset = start_ix - previous_start_ix
                    stream.write('%d %d %s\n' % (offset, num_tokens, term_id))
                    previous_start_ix = start_ix
                write_ix += 1
                log.tick()

            # make sure that we've cleared out all of the write queue
            if halting:
                if len(write_queue) > 0:
                    print(write_queue)
                    raise Exception(
                        "Line missing: ordered write queue is not empty!")
                break

    log.flushTracker()
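# A sketch of the inverse operation (hypothetical helper, based on the format
# documented in the docstring above): read the annotation file back and undo
# the offset delta encoding to recover absolute token start positions.
import codecs

def readAnnotations(annot_file):
    annotations = []
    previous_start_ix = 0
    with codecs.open(annot_file, 'r', 'utf-8') as stream:
        for line in stream:
            (offset, num_tokens, term_id) = line.strip().split(' ', 2)
            start_ix = previous_start_ix + int(offset)
            annotations.append((start_ix, int(num_tokens), term_id))
            previous_start_ix = start_ix
    return annotations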
def filterMentions(preprocessed, options):
    filtered_mentions, skipped = [], 0
    log.track(message='  >> Processed {0:,}/%s mentions' %
              '{0:,}'.format(len(preprocessed.mentions)),
              writeInterval=100)
    for m in preprocessed.mentions:
        (feature_vector,
         label) = prepSample(m, preprocessed,
                             preprocessed.global_unigram_features, options)
        # skip mentions for which no features could be extracted
        if feature_vector is None:
            skipped += 1
        else:
            filtered_mentions.append(m)
        log.tick()
    log.flushTracker()

    return filtered_mentions, skipped
def readTerminology(terminology_f,
                    tokenizer,
                    remove_stopwords=False,
                    use_collapsed_string=False):
    if remove_stopwords:
        stopwords = set(nltk.corpus.stopwords.words('english'))
    else:
        stopwords = set()

    hook = codecs.open(terminology_f, 'r', 'utf-8')

    # initialize term ID -> [entity IDs] and ngram maps
    entities_by_term = {}
    mapper = NGramMapper()

    hook.readline()  # ignore header line
    log.track(message='  >> Processed {0:,} lines', writeInterval=1000)
    for line in hook:
        (entity_ID, term) = line.split('\t')
        term = tokenizer.tokenize(term)
        entity_ID = entity_ID.strip()
        # if the string is in the set of stopwords to ignore, ignore it
        if ' '.join(term) in stopwords:
            continue

        # add string to ngram map
        term_ID = mapper.add(term, use_collapsed_string=use_collapsed_string)
        # add entity ID to the list for this term ID
        entities = entities_by_term.get(term_ID, [])
        entities.append(entity_ID)
        entities_by_term[term_ID] = list(set(entities))  # remove duplicates

        log.tick()
    log.flushTracker()

    hook.close()

    return mapper.ngrams, entities_by_term
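# A tiny example (made-up entity IDs and terms, not from the original source)
# of the tab-separated input readTerminology expects: one header line, which is
# skipped, then <entity ID>\t<term string> per line.
import codecs

with codecs.open('terminology.sample.tsv', 'w', 'utf-8') as stream:
    stream.write('EntityID\tString\n')
    stream.write('E0001\theart attack\n')
    stream.write('E0001\tmyocardial infarction\n')
    stream.write('E0002\tshortness of breath\n')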
def validate(corpus_f, annot_f, term_map):
    log.track('  >> Validated {0:,} tokens ({1:,} annotations)',
              writeInterval=100000)
    annots_so_far = 0
    with open(corpus_f, 'r') as corpus_stream, \
         open(annot_f, 'r') as annot_stream:

        annot_buffer = AnnotationBuffer(annot_stream, term_map)
        annot_buffer.refill()

        token_buffer = TokenBuffer(corpus_stream)

        since_last_term = -1
        while token_buffer.active_index < len(token_buffer.contents):
            t = token_buffer.contents[token_buffer.active_index]

            if len(annot_buffer) == 0:
                break

            # start watching any new terms beginning with this token
            since_last_term += 1
            while ((annot_buffer.active_up_to < len(annot_buffer))
                   and (since_last_term
                        == annot_buffer[annot_buffer.active_up_to].offset)):
                annot_buffer.active_up_to += 1
                since_last_term = 0

            # for all watched terms, validate that the current word is as expected
            for i in range(annot_buffer.active_up_to):
                annot = annot_buffer[i]
                if annot.num_tokens != len(term_map[annot.term_id]):
                    raise KeyError(
                        'Expected term {0} to have {1:,} tokens, annotation has {2:,}'
                        .format(annot.term_id, len(term_map[annot.term_id]),
                                annot.num_tokens))
                expected_token = term_map[annot.term_id][annot.seen_so_far]
                if expected_token != t:
                    raise ValueError(
                        'Expected token "{0}" for term {1} at position {2}, found "{3}"'
                        .format(expected_token, annot.term_id,
                                annot.seen_so_far, t))
                annot.seen_so_far += 1

            # go back through the watched terms and unwatch any that have been completed
            i = 0
            while i < annot_buffer.active_up_to:
                if annot_buffer[i].seen_so_far == annot_buffer[i].num_tokens:
                    annot_buffer.pop(i)
                    annot_buffer.active_up_to -= 1
                    annots_so_far += 1
                else:
                    i += 1

            annot_buffer.refill()
            token_buffer.shift()
            log.tick(annots_so_far)
    log.flushTracker(annots_so_far)
def extractAllEntities(data_directories,
                       log=log,
                       with_full_text=False,
                       errors='strict',
                       by_document=False,
                       polarity_type=int):
    '''
    Extract all Mobility, Action, Assistance, and Quantification entities from
    XML-formatted annotation files.

    @parameters
      data_directories :: list of directories containing .xml annotation files
      with_full_text   :: include the full document text in the "full_text" field of each object
      errors           :: error-handling mode passed through to XMLEntityExtractor.extractMentions
      by_document      :: if True, return a list of document objects (one per file)
                          instead of the per-type entity lists
      polarity_type    :: type used for entity polarity values (passed through to the extractor)
      log              :: logging object to write to (defaults to dng_logger.log)

    @returns
      mobilities      :: list of Mobility objects
      actions         :: list of Action objects
      assistances     :: list of Assistance objects
      quantifications :: list of Quantification objects
    '''
    mobilities = []
    actions = []
    assistances = []
    quantifications = []

    documents = []

    extractor = XMLEntityExtractor()

    for dir_path in data_directories:
        files = os.listdir(dir_path)

        log.writeln('Extracting data from %s...' % dir_path)
        log.track(
            message=
            '  >> Extracted entities from {0:,}/{1:,} files ({2:,} entities)',
            writeInterval=1)

        for f in files:
            fpath = os.path.join(dir_path, f)
            doc = extractor.extractMentions(fpath,
                                            with_full_text=with_full_text,
                                            errors=errors,
                                            polarity_type=polarity_type,
                                            as_document=True)

            doc.file_path = fpath
            doc.ID = f

            for m in doc.mobilities:
                m.file_ID = f
                mobilities.append(m)
            for m in doc.actions:
                m.file_ID = f
                actions.append(m)
            for m in doc.assistances:
                m.file_ID = f
                assistances.append(m)
            for m in doc.quantifications:
                m.file_ID = f
                quantifications.append(m)

            documents.append(doc)

            log.tick(
                len(files),
                len(mobilities) + len(actions) + len(assistances) +
                len(quantifications))
        log.flushTracker(
            len(files),
            len(mobilities) + len(actions) + len(assistances) +
            len(quantifications))

    if by_document:
        return documents
    else:
        return (mobilities, actions, assistances, quantifications)
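# A hypothetical invocation (directory names are placeholders); with the
# default by_document=False the four per-type entity lists are returned:
#
#   mobilities, actions, assistances, quantifications = extractAllEntities(
#       ['data/annotations/batch1', 'data/annotations/batch2'],
#       with_full_text=True,
#   )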
def extractAllEntities(data_directories, text_directory, csv_id_pattern, txt_sub_pattern, log=log,
        with_full_text=False, by_document=True):
    '''
    Extract all Mobility, Action, Assistance, and Quantification entities from
    CSV-formatted annotation files.

    @parameters
      data_directories :: list of directories containing .csv annotation files
      text_directory   :: directory containing reference .txt files
      csv_id_pattern   :: Python regex pattern for extracting file ID from a CSV file name
                          (as first group); e.g. 'myfile_([0-9]*).csv' will extract
                          '12345' as file ID from file myfile_12345.csv
      txt_sub_pattern  :: Python string formatting pattern for matching to reference
                          text files (ID substituted for {0}); e.g., 'mytext_{0}.txt'
                          will look for mytext_12345.txt for file ID 12345
      log              :: logging object to write to (defaults to dng_logger.log)
      by_document      :: if True (the default), return a list of document objects,
                          one per matched annotation file, instead of the per-type lists

    @returns
      mobilities      :: list of Mobility objects
      actions         :: list of Action objects
      assistances     :: list of Assistance objects
      quantifications :: list of Quantification objects
    '''
    documents = []

    mobilities = []
    actions = []
    assistances = []
    quantifications = []

    paired_files = matchAnnotationAndTextFiles(data_directories, text_directory, csv_id_pattern, txt_sub_pattern, log=log)

    log.track(message='  >> Extracted entities from {0:,}/{1:,} files ({2:,} entities)', writeInterval=1)
    for (_id, (csvf, txtf)) in paired_files.items():
        doc = extractAnnotationsFromFile(
            csvf, 
            txtf, 
            as_document=True
        )

        doc.file_path = txtf
        doc.ID = _id

        for m in doc.mobilities:
            m.file_ID = _id
            mobilities.append(m)
        for m in doc.actions:
            m.file_ID = _id
            actions.append(m)
        for m in doc.assistances:
            m.file_ID = _id
            assistances.append(m)
        for m in doc.quantifications:
            m.file_ID = _id
            quantifications.append(m)

        documents.append(doc)

        log.tick(len(paired_files), len(mobilities) + len(actions) + len(assistances) + len(quantifications))
    log.flushTracker(len(paired_files), len(mobilities) + len(actions) + len(assistances) + len(quantifications))

    if by_document:
        return documents
    else:
        return (
            mobilities,
            actions,
            assistances,
            quantifications
        )
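# A hypothetical invocation using the patterns described in the docstring
# (paths are placeholders); with the default by_document=True a list of
# document objects, one per matched annotation file, is returned:
#
#   documents = extractAllEntities(
#       ['data/annotations'],
#       'data/text',
#       csv_id_pattern='myfile_([0-9]*).csv',
#       txt_sub_pattern='mytext_{0}.txt',
#   )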