Example No. 1
def get():
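    # Search the index for sentences containing the query keywords, classify
    # each hit with the classifier named by the 'classifier' parameter
    # ('ml', 'dl', or 'rule'), and return the enriched results as JSON.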
    keywords = bottle.request.params.keywords.split()
    classifier_name = bottle.request.params.classifier

    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [keywords]), ('name_s', [['sentence']])
    ],
                                        rows=1000)

    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'),
                              sent)

        if classifier_name == 'ml':
            features = mlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_ml)
            predicteds = mlclassifier.classify(features, model_ml)
        elif classifier_name == 'dl':
            features = dlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_dl)
            predicteds = dlclassifier.classify(features, model_dl)
        elif classifier_name == 'rule':
            features = ruleclassifier.convert_into_features_using_rules(
                [(r['doc_id_i'], sent, tokens)], rule)
            predicteds = ruleclassifier.classify(features, rule)

        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])

    return json.dumps(results, ensure_ascii=False)
Example No. 2
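# Recursively collect the chunks that depend on `chunk` and whose first token
# has the same POS as the chunk's own first token; `all_chunks` and `tokens`
# come from the enclosing scope.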
def _extend(chunk, chunk_tokens):
    for child in all_chunks:
        _, link = child['link']
        if link == -1:
            continue
        if all_chunks[link] != chunk:
            continue
        child_tokens = find_xs_in_y(tokens, child)
        if child_tokens[0]['POS'] == chunk_tokens[0]['POS']:
            return [child] + _extend(child, child_tokens)
    return []
Example No. 3
def create_language_model(doc_ids, N=3):
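    # Build an order-N maximum-likelihood language model over token lemmas,
    # padding every sentence with __BOS__/__EOS__ markers before counting
    # n-grams.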
    sents = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sents.append(['__BOS__'] + [token['lemma']
                                        for token in tokens] + ['__EOS__'])
    vocab = Vocabulary([word for sent in sents for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sents]
    lm = MLE(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm
Example No. 4
def find_child(parent, chunks_in_sent, tokens_in_sent, text, all_chunks,
               child_cond):
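    # Return the first chunk in the sentence that depends on `parent` and
    # matches `child_cond`: either its surface text is listed under 'text',
    # or its last token's POS/lemma match 'pos1'/'lemma1' while the
    # second-to-last token's POS is not listed in 'pos2_ng'.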
    for child in chunks_in_sent:
        _, link = child['link']
        if link == -1 or all_chunks[link] != parent:
            continue
        child_tokens = find_xs_in_y(tokens_in_sent, child)
        if text[child['begin']:child['end']] in child_cond.get('text', []):
            return child, child_tokens
        if child_tokens[-1]['POS'] in child_cond.get('pos1', []) and \
                child_tokens[-1]['lemma'] in child_cond.get('lemma1', []) and \
                child_tokens[-2]['POS'] not in child_cond.get('pos2_ng', []):
            return child, child_tokens
    return None, None
Example No. 5
def get():
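    # Search the index for sentences from articles matching the given title
    # and keywords, score each hit with the sentiment model, and return the
    # results as JSON.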
    title = bottle.request.params.title.strip()
    keywords = bottle.request.params.keywords.split()

    results = indexer.search_annotation(fl_keyword_pairs=[
        ('title_txt_ja', [[title]]), ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']])
    ],
                                        rows=1000)

    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'),
                              sent)

        features = sentimentclassifier.convert_into_features_using_vocab(
            [(r['doc_id_i'], sent, tokens)], vocab)
        predicteds = mlclassifier.classify(features, model)

        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])

    return json.dumps(results, ensure_ascii=False)
Example No. 6
import sqlitedatastore as datastore
from annoutil import find_x_including_y, find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    anno_name = 'affiliation'
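    # Print one token per line in CoNLL-like "surface<TAB>POS<TAB>tag" format,
    # labelling tokens inside an 'affiliation' annotation with B-/I- tags and
    # all remaining tokens with 'O'.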

    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        sentences = datastore.get_annotation(doc_id, 'sentence')
        tokens = datastore.get_annotation(doc_id, 'token')
        annos = datastore.get_annotation(doc_id, anno_name)
        for sentence in sentences:
            annos_in_sentence = find_xs_in_y(annos, sentence)
            if annos_in_sentence == []:
                continue
            prev = False
            for token in find_xs_in_y(tokens, sentence):
                if find_x_including_y(annos_in_sentence, token) is None:
                    prev = False
                    print('{0}\t{1}\t{2}'.format(
                        text[token['begin']:token['end']], token['POS'], 'O'))
                else:
                    if prev:
                        print('{0}\t{1}\tI-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    else:
                        print('{0}\t{1}\tB-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    prev = True
    datastore.close()
Example No. 7
import time

import mlclassifier
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    # Load the labeled data: each line holds "label doc_id sentence_id";
    # lines starting with '#' are skipped
    sentences = []
    labels = []
    with open('./data/labels.txt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            d = line.rstrip().split()
            label, doc_id, sent_id = int(d[0]), d[1], int(d[2])
            sent = datastore.get_annotation(doc_id, 'sentence')[sent_id]
            tokens = find_xs_in_y(datastore.get_annotation(doc_id, 'token'),
                                  sent)
            sentences.append((doc_id, sent, tokens))
            labels.append(label)

    # Generate feature vectors for the training split
    num_train = int(len(sentences) * 0.8)
    sentences_train = sentences[:num_train]
    labels_train = labels[:num_train]
    features, vocab = mlclassifier.convert_into_features(sentences_train)

    # Train the model
    time_s = time.time()
    print(':::TRAIN START')
    model = mlclassifier.train(labels_train, features)
    print(':::TRAIN FINISHED', time.time() - time_s)
Example No. 8
import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
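    # Fetch all indexed sentence annotations, classify them with the
    # rule-based classifier, and print the sentences predicted as positive.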
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('name_s', [['sentence']]),
    ],
                                        rows=3000)
    sentences = []
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'),
                              sent)
        sentences.append((r['doc_id_i'], sent, tokens))

    # Load the classification rules
    rule = ruleclassifier.get_rule()

    # Classify the sentences with the rules
    features = ruleclassifier.convert_into_features_using_rules(
        sentences, rule)
    predicteds = ruleclassifier.classify(features, rule)
    for predicted, (doc_id, sent, tokens) in zip(predicteds, sentences):
        if predicted == 1:
            text = datastore.get(doc_id, ['content'])['content']
            print(predicted, text[sent['begin']:sent['end']])
    datastore.close()
Example No. 9
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
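    # Print every token with its surface form, POS, document ID and character
    # offsets, one sentence per block.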
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        sentences = datastore.get_annotation(doc_id, 'sentence')
        tokens = datastore.get_annotation(doc_id, 'token')
        for sentence in sentences:
            for token in find_xs_in_y(tokens, sentence):
                print('{0}\t{1}\t{2}\t{3}\t{4}'.format(
                    text[token['begin']:token['end']], token['POS'], doc_id,
                    token['begin'], token['end']))
            print()  # blank line between sentences
    datastore.close()
Example No. 10
import itertools
import logging

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

from annoutil import find_xs_in_y
import sqlitedatastore as datastore

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':
    datastore.connect()
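    # Collect the lemmas of every sentence, keeping only tokens that are not
    # part of a named entity ('NE' == 'O').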
    sentences = []
    for doc_id in datastore.get_all_ids(limit=-1):
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sentences.append(
                [token['lemma'] for token in tokens if token.get('NE') == 'O'])

    n_sent = 20
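    # Concatenate every n_sent consecutive sentences into one pseudo-document
    # so the topic model works on larger contexts than single sentences.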
    docs = [
        list(itertools.chain.from_iterable(sentences[i:i + n_sent]))
        for i in range(0, len(sentences), n_sent)
    ]

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=2, no_above=0.3)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    lda = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)
Example No. 11
def extract_relation(doc_id):
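    # For every sentence, find a chunk containing the verb '与える' (to give /
    # to exert); require an '影響を' ("influence" + object marker) child chunk,
    # take a topic/subject-marked child ('は', 'も', 'が') as the cause and a
    # 'に'-marked child as the effect, then yield the cause-effect relation.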
    text = datastore.get(doc_id, fl=['content'])['content']
    all_chunks = datastore.get_annotation(doc_id, 'chunk')
    all_tokens = datastore.get_annotation(doc_id, 'token')
    anno_id = 0
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        chunks = find_xs_in_y(all_chunks, sent)
        tokens = find_xs_in_y(all_tokens, sent)
        for chunk in chunks:
            chunk_tokens = find_xs_in_y(tokens, chunk)
            if not any([
                    chunk_token['lemma'] == '与える'
                    for chunk_token in chunk_tokens
            ]):
                continue

            affect, affect_tokens = find_child(chunk,
                                               chunks,
                                               tokens,
                                               text,
                                               all_chunks,
                                               child_cond={'text': ['影響を']})
            if affect is None:
                continue

            cause, cause_tokens = find_child(chunk,
                                             chunks,
                                             tokens,
                                             text,
                                             all_chunks,
                                             child_cond={
                                                 'pos1': ['助詞'],
                                                 'lemma1': ['は', 'も', 'が'],
                                                 'pos2_ng': ['助詞'],
                                             })
            if cause is None:
                continue

            effect, effect_tokens = find_child(chunk,
                                               chunks,
                                               tokens,
                                               text,
                                               all_chunks,
                                               child_cond={
                                                   'pos1': ['助詞'],
                                                   'lemma1': ['に'],
                                                   'pos2_ng': ['助詞'],
                                               })
            if effect is None:
                continue

            cause = extend_phrase(cause, cause_tokens, tokens, all_chunks)
            effect = extend_phrase(effect, effect_tokens, tokens, all_chunks)

            relation = {
                'cause': {
                    'begin': cause['begin'],
                    'end': cause['end'],
                    'link': ('effect', anno_id),
                },
                'effect': {
                    'begin': effect['begin'],
                    'end': effect['end'],
                }
            }

            anno_id += 1
            yield sent, relation