Example #1
def decode(self, sentence_tokens):
    tag_indexer = self.tag_ix
    feature_indexer = self.feature_ix
    # Score every (word position, tag) pair with the emission feature extractor.
    all_features = []
    for word_idx in range(len(sentence_tokens)):
        features = []
        for tag_idx in range(len(tag_indexer)):
            features.append(
                extract_emission_features(sentence_tokens,
                                          word_idx,
                                          tag_indexer.get_object(tag_idx),
                                          feature_indexer,
                                          add_to_indexer=False))
        all_features.append(features)
    all_features = np.array(all_features)
    # The forward pass returns a score and the predicted tag index sequence.
    score, seq = self.forward(all_features)
    seq = flatten(seq)
    # Map tag indices back to BIO tag strings and convert them to labeled chunks.
    pred_tags = [self.tag_ix.ints_to_objs[j] for j in seq]
    return LabeledSentence(sentence_tokens,
                           chunks_from_bio_tag_seq(pred_tags))
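A minimal usage sketch for the decode method above (the enclosing class is not shown here; `model` and `dev_data` are hypothetical names for a trained instance of that class and a list of LabeledSentence objects):

# Hypothetical usage: tag every sentence in a held-out set.
predictions = [model.decode(ex.tokens) for ex in dev_data]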
Example #2
def train_model_based_binary_ner(ner_exs: List[PersonExample]):
    shuffle(ner_exs)
    """
    =======================================
    ========== Build Indexers =============
    """
    word_ix, pos_ix = create_index(ner_exs=ner_exs, stops=stops)
    ix2embedding = load_word_embedding(pretrained_embedding_filename=config.glove_file,
                                       word2index_vocab=word_ix.objs_to_ints)
    train_sent, POS, train_labels = index_data(ner_exs, word_ix, pos_ix)

    epochs = config.epochs
    batch_size = config.batch_size
    initial_lr = config.initial_lr
    no_of_classes = config.no_of_classes

    """
    ==================================
    =====  Network Definition ========
    ==================================
    """
    word_indicator_feat_dim = len(word_ix)
    pos_indicator_feat_dim = len(pos_ix)
    is_upper_feat_dim = 1
    all_caps_indicator_feat_dim = 1

    word_embedding_feat_dim = 300
    context_window_1 = 300
    context_window_2 = 300
    context_left_1 = 300
    context_left_2 = 300
    context_right_1 = 300

    feat_dim = 0

    # feat_dim += word_indicator_feat_dim
    # feat_dim += pos_indicator_feat_dim
    # feat_dim += is_upper_feat_dim
    # feat_dim += all_caps_indicator_feat_dim
    #
    feat_dim += word_embedding_feat_dim
    # feat_dim += context_window_1
    # feat_dim += context_window_2
    # feat_dim += context_left_1
    # # feat_dim += context_left_2
    # feat_dim += context_right_1

    n_input_dim = feat_dim
    n_hidden1 = 16  # Number of hidden nodes
    n_hidden2 = 8
    n_output = 2  # Two output nodes for the binary classifier

    net = nn.Sequential(
        nn.Linear(n_input_dim, n_hidden1),
        nn.ELU(),
        nn.Linear(n_hidden1, n_hidden2),
        nn.ELU(),
        nn.Linear(n_hidden2, n_output))  # raw logits: BCEWithLogitsLoss applies the sigmoid itself
    print(net)

    learning_rate = initial_lr

    for epoch in range(epochs):
        t = time.time()
        # Build the optimizer once per epoch (with the current learning rate) rather than
        # once per batch, which would reset Adam's moment estimates on every step.
        optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
        """
        ================= Create batch ===============
        """
        for i in range(0, len(train_sent), batch_size):
            if len(train_sent[i:]) <= batch_size:
                data_batch = train_sent[i:]
                pos_batch = POS[i:]
            else:
                data_batch = train_sent[i: i + batch_size]
                pos_batch = POS[i: i + batch_size]

            Y_train = flatten(train_labels[i: i + batch_size])

            """
            ========== scaling ================== 
            """
            # compute class weights
            if Y_train.count(1) == 0:
                scaling = 1
            else:
                scaling = Y_train.count(0) / Y_train.count(1)

            pos_weight = torch.ones([no_of_classes])
            pos_weight[1] = scaling

            loss_func = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
            # loss_func = nn.BCELoss()

            Y_train = np.asarray(Y_train)
            X_train = []
            for sent, pos in zip(data_batch, pos_batch):
                for idx in range(0, len(sent)):
                    X_train.append(get_features(sent, pos, word_ix, pos_ix, ix2embedding, idx))
            X_train = np.asarray(X_train)

            # One hot
            y_train_one_hot = np.zeros((Y_train.size, no_of_classes))
            for ix, n in enumerate(Y_train):
                if n == 0:
                    y_train_one_hot[ix, 0] = 1
                else:
                    y_train_one_hot[ix, 1] = 1
            Y_train = y_train_one_hot

            # convert to tensor
            X_train_t = torch.FloatTensor(X_train)
            Y_train_t = torch.FloatTensor(Y_train)

            y_hat = net(X_train_t)

            loss = loss_func(y_hat, Y_train_t)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print("Epoch : ", epoch)
        print("Time taken", time.time() - t)
        print("Learning Rate = ", learning_rate)
        # Halve the learning rate after every epoch, resetting it to initial_lr every third epoch.
        if (epoch + 1) % 3 == 0:
            learning_rate = initial_lr * 2
        learning_rate = learning_rate / 2
        print("----------")
        print(" ")

    return BinaryPersonClassifier(model=net,
                                  word_ix=word_ix,
                                  pos_ix=pos_ix,
                                  ix2embed=ix2embedding)
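As a standalone illustration of the class-weighting step above (not part of the original function), the toy snippet below shows how the pos_weight argument of nn.BCEWithLogitsLoss scales the loss contribution of the positive class; all tensor values are made up:

import torch
import torch.nn as nn

# One example, two output nodes; column 1 is the (rare) positive class.
logits = torch.tensor([[2.0, -1.0]])
targets = torch.tensor([[0.0, 1.0]])

# Weighting the positive class 5x (as the scaling ratio above would) inflates its loss term.
weighted = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.0, 5.0]))
unweighted = nn.BCEWithLogitsLoss()
print(unweighted(logits, targets).item(), weighted(logits, targets).item())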
Example #3
def main(docs, labels, supervision_rate):

    # create dictionaries {label: label_id} and {label_id: label}
    # (`all_labels` is assumed to be visible at module level, e.g. as returned
    # by import_pubmed_data in Example #5)
    label_to_id_dict = {v: n for n, v in enumerate(all_labels)}
    id_to_label_dict = {v: k for k, v in label_to_id_dict.items()}

    # Filter out docs that are 250 characters or shorter
    docs_labels = [pair for pair in zip(docs, labels) if len(pair[0]) > 250]
    docs, labels = zip(*docs_labels)

    # need to transform 'labels' from strings to indices
    label_ids = [[label_to_id_dict[label] for label in label_list]
                 for label_list in labels]

    # subset list of doc labels at given rate and exclude example docs
    # example doc ids
    doc_ids = [29, 38, 13, 28, 41]
    label_ids_sub = [
        label_id_list if (random.uniform(0, 1) < supervision_rate) and
        (doc_id not in doc_ids) else []
        for doc_id, label_id_list in enumerate(label_ids)
    ]

    # calculate how many topics are captured
    known_topic_counter = Counter()
    for label_id in flatten(label_ids_sub):
        known_topic_counter[label_id] += 1

    # percentage of topics covered
    topic_coverage = float(len(known_topic_counter)) / len(all_labels)
    print('Topic coverage: %s' % topic_coverage)

    # Vectorize
    tfidf_vectorizer = tfidf.Vectorizer(vocab_size=2000)
    tfidf_vectorizer.fit(docs)
    doc_term_matrix, terms = tfidf_vectorizer.transform(docs)

    # Factorize (weakly supervised)
    ws_nmf_model = ws_nmf.Model(doc_term_matrix,
                                label_ids_sub,
                                K=len(all_labels))
    ws_nmf_model.train(max_iter=30)
    doc_topic_matrix_ws = ws_nmf_model.W
    topic_term_matrix_ws = ws_nmf_model.H

    # Create useful dictionaries
    # {topic id: terms}
    topic_to_term_dict_ws = create_topic_to_term_dict(topic_term_matrix_ws,
                                                      terms)
    # {doc id: [(topic id, strength)]}
    doc_to_topic_ws = create_doc_to_topic_dict(
        doc_topic_matrix_ws,
        cutoff=0.001)  # higher cutoff reduces dictionary size
    # {doc id: [(label id, strength)]}
    doc_to_label = defaultdict(list)
    for doc_ind, label_list in enumerate(label_ids):
        for label in label_list:
            doc_to_label[doc_ind].append((label, 1))

    # Compute topic to label similarity matrix
    similarity_ws = compute_similarity_matrix(doc_to_topic_ws, doc_to_label)

    # Run the Hungarian algorithm
    score_ws, sorted_matches_ws, matched_similarity_ws = match_similarity_matrix(
        similarity_ws)

    # Print assignment score
    print('Average similarity: %s' % score_ws)

    # Print top 50 matched assignment
    matched_topic_terms_ws = [
        (round(score, 3), id_to_label_dict[label_ind],
         topic_to_term_dict_ws[topic_ind])
        for score, topic_ind, label_ind in sorted_matches_ws
    ]
    pprint.pprint(matched_topic_terms_ws[:50])

    # Determine number of "resolved" topics (similarity > 0.1)
    n_resolved = len([
        score for score, topic_ind, label_ind in sorted_matches_ws
        if score > 0.1
    ])

    # Print number of topics resolved
    print('Number of topics resolved: %s' % n_resolved)

    # Print examples of documents
    print_examples(doc_ids, docs, doc_to_label, doc_to_topic_ws,
                   topic_to_term_dict_ws, id_to_label_dict)

    return topic_coverage, score_ws, n_resolved
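The match_similarity_matrix helper used above is not shown in this example; a minimal sketch of that Hungarian-matching step, assuming it can be built on SciPy's linear_sum_assignment, might look like this:

import numpy as np
from scipy.optimize import linear_sum_assignment

def match_similarity_sketch(similarity_matrix):
    sim = np.asarray(similarity_matrix)
    # linear_sum_assignment minimizes cost, so negate to maximize similarity.
    topic_ind, label_ind = linear_sum_assignment(-sim)
    scores = sim[topic_ind, label_ind]
    # (score, topic id, label id) triples, strongest match first.
    sorted_matches = sorted(zip(scores.tolist(), topic_ind.tolist(), label_ind.tolist()),
                            reverse=True)
    return scores.mean(), sorted_matches, scores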
Example #4
def train_mlp_ner(train_data: List[LabeledSentence], dev_data, test_data):
    shuffle(train_data)
    """
    =======================================
    ========== Build Indexers =============
    """
    tag_ix = Indexer()
    word_ix = Indexer()
    pos_ix = Indexer()
    word_counter = Counter()

    tag_ix.add_and_get_index(conf.PAD_TOKEN)   # padding
    word_ix.add_and_get_index(conf.PAD_TOKEN)
    tag_ix.add_and_get_index(conf.EOS_TOKEN)   # End of Sentence
    word_ix.add_and_get_index(conf.EOS_TOKEN)
    tag_ix.add_and_get_index(conf.BOS_TOKEN)   # Beginning of Sentence
    word_ix.add_and_get_index(conf.BOS_TOKEN)
    tag_ix.add_and_get_index(conf.UNK_TOKEN)   # Unknown words
    word_ix.add_and_get_index(conf.UNK_TOKEN)
    pos_ix.add_and_get_index(conf.UNK_TOKEN)   # needed so the POS fallback below can resolve

    for sentence in train_data:
        for token in sentence.tokens:
            word_counter[token.word] += 1.0

    for sentence in train_data:
        for token in sentence.tokens:
            # get_word_index applies the count threshold `th` and the stop-word list;
            # words it filters out are left unindexed and will be treated as UNK
            get_word_index(word_indexer=word_ix, word_counter=word_counter, stops=stops, word=token.word.lower(), th=0)
            pos_ix.add_and_get_index(token.pos)
        for tag in sentence.get_bio_tags():
            tag_ix.add_and_get_index(tag)

    ix2embedding = load_word_embedding(pretrained_embedding_filename=conf.glove_file,
                                       word2index_vocab=word_ix.objs_to_ints)
    train_sent = []
    POS = []
    train_labels = []

    for sentence in train_data:
        s = []
        pos = []
        labels = []
        for token in sentence.tokens:
            if token.word.lower() in word_ix.objs_to_ints:
                s.append(word_ix.objs_to_ints[token.word.lower()])
            else:
                s.append(word_ix.objs_to_ints[conf.UNK_TOKEN])

            if token.pos in pos_ix.objs_to_ints:
                pos.append(pos_ix.objs_to_ints[token.pos])
            else:
                pos.append(pos_ix.objs_to_ints[conf.UNK_TOKEN])

        for tag in sentence.get_bio_tags():
            labels.append(tag_ix.objs_to_ints[tag])
        train_sent.append(s)
        POS.append(pos)
        train_labels.append(labels)

    epochs = conf.epochs
    batch_size = conf.batch_size
    initial_lr = conf.initial_lr
    no_of_classes = len(tag_ix)

    """
    ==================================
    =====  Network Definition ========
    ==================================
    """
    word_indicator_feat_dim = len(word_ix)
    pos_indicator_feat_dim = len(pos_ix)
    is_upper_feat_dim = 1
    all_caps_indicator_feat_dim = 1

    word_embedding_feat_dim = 300
    context_window_1 = 300
    context_window_2 = 300
    context_left_1 = 300
    context_left_2 = 300
    context_right_1 = 300

    feat_dim = 0

    # feat_dim += word_indicator_feat_dim
    feat_dim += pos_indicator_feat_dim
    feat_dim += is_upper_feat_dim
    feat_dim += all_caps_indicator_feat_dim
    #
    feat_dim += word_embedding_feat_dim
    feat_dim += context_window_1
    # feat_dim += context_window_2
    # feat_dim += context_left_1
    # # feat_dim += context_left_2
    # feat_dim += context_right_1

    n_input_dim = feat_dim
    n_hidden1 = 64  # Number of hidden nodes
    n_hidden2 = 32
    n_hidden3 = 16
    n_output = no_of_classes  # One output node per BIO tag (multi-class, not binary)

    net = nn.Sequential(
        nn.Linear(n_input_dim, n_hidden1),
        nn.ELU(),
        nn.Dropout(0.2),
        nn.Linear(n_hidden1, n_hidden2),
        nn.ELU(),
        nn.Dropout(0.2),
        nn.Linear(n_hidden2, n_hidden3),
        nn.ELU(),
        nn.Linear(n_hidden3, n_output),
        nn.Sigmoid())
    print(net)

    learning_rate = initial_lr
    # Create the optimizer and loss once up front; re-creating Adam inside the batch loop
    # would reset its moment estimates on every step.
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
    loss_func = nn.BCELoss()

    best_f1 = 0
    for epoch in range(epochs):
        t = time.time()
        """
        ================= Create batch ===============
        """
        for i in range(0, len(train_sent), batch_size):
            if len(train_sent[i:]) <= batch_size:
                data_batch = train_sent[i:]
                pos_batch = POS[i:]
            else:
                data_batch = train_sent[i: i + batch_size]
                pos_batch = POS[i: i + batch_size]

            Y_train = flatten(train_labels[i: i + batch_size])
            Y_train = np.asarray(Y_train)
            X_train = []
            for sent, pos in zip(data_batch, pos_batch):
                for idx in range(0, len(sent)):
                    X_train.append(get_features(sent, pos, word_ix, pos_ix, ix2embedding, idx))
            X_train = np.asarray(X_train)

            # One hot
            y_train_one_hot = np.zeros((Y_train.size, no_of_classes))

            for ix, n in enumerate(Y_train):
                y_train_one_hot[ix, n] = 1
            Y_train = y_train_one_hot

            # convert to tensor
            X_train_t = torch.FloatTensor(X_train)
            Y_train_t = torch.FloatTensor(Y_train)

            y_hat = net(X_train_t)

            loss = loss_func(y_hat, Y_train_t)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        model = MLPNerClassifier(model=net,
                                 word_ix=word_ix,
                                 pos_ix=pos_ix,
                                 tag_ix=tag_ix,
                                 ix2embed=ix2embedding)
        # Compute dev-set F1
        # if (epoch + 1) % 3 == 0:
        dev_decoded = [model.decode(test_ex.tokens) for test_ex in dev_data]
        f1 = print_evaluation_metric(dev_data, dev_decoded)
        if f1 > best_f1:
            best_f1 = f1
            test_decoded = [model.decode(test_ex.tokens) for test_ex in test_data]
            write_test_output(test_decoded, conf.output_path)

        print("-------------------------")
        print("Epoch: ", epoch)
        print("Time taken: ", time.time() - t)
        print(" ")
        print(" -------------------------")
        print("----------")
        print(" ")

    return model
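The per-token one-hot target construction used in the training loop above (and in Example #2) can be written equivalently with np.eye; a tiny self-contained check with toy values:

import numpy as np

no_of_classes = 4                   # toy value; the code above uses len(tag_ix)
Y_train = np.asarray([0, 2, 1, 2])  # toy flattened tag indices

# Row i of np.eye(no_of_classes) is the one-hot vector for class i,
# so indexing rows by Y_train reproduces the loop in a single step.
y_train_one_hot = np.eye(no_of_classes)[Y_train]
print(y_train_one_hot)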
Example #5
def import_pubmed_data(pubmed_filename):
    print('importing raw data...')
    e = ET.parse(pubmed_filename).getroot()

    # Initialize data dictionaries
    pubmed_dicts = []

    print('parsing XML...')
    for article in e.findall('PubmedArticle'):

        # Get article ID
        article_id = article.find(".//ArticleId[@IdType='pubmed']").text

        # Get abstract text (pass if no text)
        find_abstracts = article.findall(".//AbstractText")
        if len(find_abstracts) > 0:
            cur_abstract = ' '.join(
                [abstract.text for abstract in find_abstracts])
        else:
            continue

        # Get keywords (pass if no keywords)
        find_keywords = article.findall(".//MeshHeading/DescriptorName")
        if len(find_keywords) > 0:
            cur_keywords = [keyword.text for keyword in find_keywords]
        else:
            continue

        pubmed_dicts.append({
            'article_id': article_id,
            'abstract': cur_abstract,
            'keywords': cur_keywords
        })

    print('filtering infrequent keywords...')
    # Create counter with keywords
    keywords_counter = Counter()
    for kw in flatten([d['keywords'] for d in pubmed_dicts]):
        keywords_counter[kw] += 1

    # Filter out keywords that occur fewer than 100 times
    keywords_set = {
        kw for kw, count in keywords_counter.items() if count >= 100
    }

    for d in pubmed_dicts:
        d['keywords'] = [kw for kw in d['keywords'] if kw in keywords_set]

    # Now filter out documents that have no remaining keywords
    pubmed_dicts = [d for d in pubmed_dicts if len(d['keywords']) > 0]

    print('N unique keywords: %s' % len(keywords_set))
    print('N docs: %s' % len(pubmed_dicts))

    # Create list of docs
    docs = [d['abstract'] for d in pubmed_dicts]

    # Create list of labels lists
    labels = [d['keywords'] for d in pubmed_dicts]

    # Create list of all unique labels
    all_labels = list(set(flatten(labels)))

    return docs, labels, all_labels
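A hedged end-to-end sketch of how this loader could feed the main() driver from Example #3, assuming both functions can see the same all_labels (e.g. they live in one module); the file name and supervision rate are placeholders:

# Hypothetical driver code.
docs, labels, all_labels = import_pubmed_data('pubmed_export.xml')
topic_coverage, score_ws, n_resolved = main(docs, labels, supervision_rate=0.5)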