Example #1
import numpy as np

# `util` and `load_arora_sentence_embedder` are helpers from the surrounding
# project; the embedder is initialized lazily on first use.
arora_sentence_embedder = None


def arora_mean_pairwise_sim(sample):
    """
    Returns the mean cosine similarity (w.r.t. Arora embeddings) over all
    (prompt sentence, story sentence) pairs.
    """

    # Init sentence embedder if necessary
    global arora_sentence_embedder
    if arora_sentence_embedder is None:
        print(
            "\nInitializing arora sent embedder for the pairwise_arora_cosine_similarity metric..."
        )
        arora_sentence_embedder = load_arora_sentence_embedder()

    # Get sentences
    prompt_sentences = util.get_sentences(sample, 'prompt')
    story_sentences = util.get_sentences(sample, 'story')

    # Get embeddings.
    # prompt_embeddings is a np array of shape (num_prompt_sents, emb_len), and
    # similarly for story_embeddings; sentences that fail to embed are dropped.
    prompt_embeddings = [
        arora_sentence_embedder.embed_sent(sent.split())
        for sent in prompt_sentences
    ]
    prompt_embeddings = np.array(
        [np.array(emb) for emb in prompt_embeddings if emb is not None])
    story_embeddings = [
        arora_sentence_embedder.embed_sent(sent.split())
        for sent in story_sentences
    ]
    story_embeddings = np.array(
        [np.array(emb) for emb in story_embeddings if emb is not None])

    # Get prompt/story similarities. Might both be None.
    prompt_story_table, mean_pairwise_sim = util.get_sims(
        prompt_embeddings, story_embeddings)

    # Compute the story-sentence/prompt similarity table: a np array of shape
    # (num_story_sents,), or None, giving each story sentence's mean similarity
    # to the prompt sentences.
    storysent_prompt_table = np.mean(
        prompt_story_table, axis=0) if prompt_story_table is not None else None

    # Save the tables to cache
    sample.cache['arora_stats'] = {
        "prompt_story_table": prompt_story_table,
        "storysent_prompt_table": storysent_prompt_table,
    }

    return mean_pairwise_sim
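
The snippet relies on util.get_sims to build the similarity table. For reference, here is a minimal sketch of what such a helper could compute, assuming row-wise sentence embeddings as described in the comments above (an illustration, not the project's actual implementation):

import numpy as np

def get_sims(prompt_embeddings, story_embeddings):
    """Pairwise cosine similarities between two embedding matrices."""
    if prompt_embeddings.size == 0 or story_embeddings.size == 0:
        return None, None
    # Row-normalize, then a matrix product yields all pairwise cosines
    p = prompt_embeddings / np.linalg.norm(prompt_embeddings, axis=1, keepdims=True)
    s = story_embeddings / np.linalg.norm(story_embeddings, axis=1, keepdims=True)
    prompt_story_table = p @ s.T  # shape (num_prompt_sents, num_story_sents)
    return prompt_story_table, float(prompt_story_table.mean())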
Example #2
from operator import itemgetter

# `get_sentences`, `tokenize`, `pos_tag`, `chunk`, `get_terms`, and `_keywords`
# are helpers from the surrounding project.


def get_keywords(text):
    data = {}
    for i, sentence in enumerate(get_sentences(text), start=1):
        tagged_tokens = pos_tag(tokenize(sentence))
        for term in get_terms(chunk(tagged_tokens)):
            _keywords(term, i, data)
    keywords = [{
        'keyword': v['term_forms'][0],
        'count': v['count'],
        'locations': v['locations']
    } for k, v in data.items()]
    return sorted(keywords, key=itemgetter('count'), reverse=True)
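
A quick usage sketch, assuming the tokenizing and chunking helpers above are available; the exact terms extracted depend on the tagger and chunker:

text = ("Keyword extraction identifies salient terms in a document. "
        "Extraction quality depends on the part-of-speech tagger.")
for kw in get_keywords(text):
    # each entry is a dict: {'keyword': ..., 'count': ..., 'locations': [sentence indices]}
    print(kw['keyword'], kw['count'], kw['locations'])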
Example #3
            train_set_y.append(type_list.index(item.trigger_label))

    lines = all_text.split('\n')
    si = 0
    for line in lines:
        words = line.split(' ')
        for word in words:
            if word in embd:
                # Concatenate the word embedding with this line's autoencoder feature
                train_set_x.append(np.hstack((embd[word], ae_sf[si][0])))
                # train_set_x.append(embd[word])
                train_set_y.append(19)
        si += 1  # advance to the next line's autoencoder feature
    return train_set_x, train_set_y


if __name__ == "__main__":
    # `pickle`, `np`, `AE`, `get_sentences`, and `load_dataset` are defined or
    # imported in the enclosing module
    data, filepath = get_sentences()
    ae = AE(data, filepath)
    # ae.build()
    ae.load('LSTM_AE')

    train_set_x, train_set_y = load_dataset('./mlee/train', ae)
    # Pickle needs binary mode; the with-block already flushes and closes
    with open('data/train_set_x', 'wb') as f:
        pickle.dump(train_set_x, f)
    with open('data/train_set_y', 'wb') as f:
        pickle.dump(train_set_y, f)

    valid_set_x, valid_set_y = load_dataset('./mlee/valid', ae)
    with open('data/valid_set_x', 'wb') as f:
        pickle.dump(valid_set_x, f)
Example #4
            train_set_y.append(type_list.index(item.trigger_label))

    lines = all_text.split('\n')
    si = 0
    for line in lines:
        words = line.split(' ')
        for word in words:
            if word in embd:
                # train_set_x.append(np.hstack((embd[word],ae_sf[si][0])))
                train_set_x.append(embd[word])
                train_set_y.append(19)
    return train_set_x, train_set_y


if __name__ == "__main__":
    data, filepath = get_sentences()

    train_set_x, train_set_y = load_dataset('./mlee/train')
    # Pickle needs binary mode; the with-block already flushes and closes
    with open('data/train_set_x', 'wb') as f:
        pickle.dump(train_set_x, f)

    with open('data/train_set_y', 'wb') as f:
        pickle.dump(train_set_y, f)

    print('**********loading valid set*****************')
    valid_set_x, valid_set_y = load_dataset('./mlee/valid')
    with open('data/valid_set_x', 'wb') as f:
        pickle.dump(valid_set_x, f)
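
As a usage note, the dumps above can be read back with pickle in binary mode, mirroring the write side (a sketch under the same file-layout assumption):

import pickle

with open('data/train_set_x', 'rb') as f:
    train_set_x = pickle.load(f)
with open('data/train_set_y', 'rb') as f:
    train_set_y = pickle.load(f)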
Example #5
def mean_sent_len(sample):
    """Returns average story sentence length (measured in words)"""
    sents = util.get_sentences(sample, 'story')
    lengths = [_num_words(s) for s in sents]
    return util.mean(lengths)
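
Here `_num_words` and `util.mean` are project helpers; a plausible minimal sketch of both, assuming whitespace tokenization (an assumption, not the project's actual code):

def _num_words(sentence):
    # Count whitespace-separated tokens
    return len(sentence.split())

def mean(values):
    # Arithmetic mean; defined as 0 for an empty list
    return sum(values) / len(values) if values else 0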