def get():
    keywords = bottle.request.params.keywords.split()
    classifier_name = bottle.request.params.classifier
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']]),
    ], rows=1000)
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        if classifier_name == 'ml':
            features = mlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_ml)
            predicteds = mlclassifier.classify(features, model_ml)
        elif classifier_name == 'dl':
            features = dlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_dl)
            predicteds = dlclassifier.classify(features, model_dl)
        elif classifier_name == 'rule':
            features = ruleclassifier.convert_into_features_using_rules(
                [(r['doc_id_i'], sent, tokens)], rule)
            predicteds = ruleclassifier.classify(features, rule)
        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])
    return json.dumps(results, ensure_ascii=False)
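# A minimal sketch of how a Bottle handler like this might be mounted. The
# route path, host, and port are assumptions for illustration; the listing
# itself only defines the handler body.
if __name__ == '__main__':
    datastore.connect()
    bottle.route('/classify', 'GET', get)
    bottle.run(host='localhost', port=8080)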
def get():
    doc_id = bottle.request.params.id
    names = bottle.request.params.names.split()
    row = datastore.get(doc_id, fl=['content'])
    text = row['content']
    # text = re.sub(r'[。!]', '\n', text)
    data = {
        'collection': {
            'entity_types': [],
        },
        'annotation': {
            'text': text,
            'entities': [],
            'relations': [],
        },
    }
    mapping = {}
    for name in names:
        annos = datastore.get_annotation(doc_id, name)
        for i, anno in enumerate(annos):
            data['collection']['entity_types'].append({
                'type': name,
                'bgColor': '#7fa2ff',
                'borderColor': 'darken',
            })
            Ti = 'T{0:d}'.format(len(data['annotation']['entities']) + 1)
            data['annotation']['entities'].append(
                [Ti, name, [[anno['begin'], anno['end']]]])
            mapping[(name, i)] = Ti
    for name in names:
        annos = datastore.get_annotation(doc_id, name)
        for i, anno in enumerate(annos):
            if 'link' not in anno:
                continue
            name_linked, i_linked = anno['link']
            if (name, i) not in mapping or (name_linked, i_linked) not in mapping:
                continue
            data['annotation']['relations'].append([
                'R{0:d}'.format(len(data['annotation']['relations']) + 1),
                'arg',
                [['src', mapping[(name, i)]],
                 ['tgt', mapping[(name_linked, i_linked)]]],
            ])
    return json.dumps(data, ensure_ascii=False)
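# For reference, the JSON this handler returns follows the structure the brat
# embedded visualizer expects. With a single 'affiliation' annotation and no
# links it might look like this (all values hypothetical):
#
# {
#   "collection": {"entity_types": [{"type": "affiliation",
#                                    "bgColor": "#7fa2ff",
#                                    "borderColor": "darken"}]},
#   "annotation": {"text": "...",
#                  "entities": [["T1", "affiliation", [[10, 25]]]],
#                  "relations": []}
# }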
def create_language_model(doc_ids, N=3):
    sents = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sents.append(['__BOS__']
                         + [token['lemma'] for token in tokens]
                         + ['__EOS__'])
    vocab = Vocabulary([word for sent in sents for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sents]
    lm = MLE(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm
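# A minimal usage sketch. It assumes the imports this function relies on
# (nltk.lm.MLE, nltk.lm.Vocabulary, nltk.util.ngrams) and a populated
# datastore; the query words are hypothetical.
if __name__ == '__main__':
    datastore.connect()
    lm = create_language_model(datastore.get_all_ids(limit=-1), N=3)
    # Probability of a word given the preceding N-1 words:
    print(lm.score('影響', ['経済', '成長']))  # hypothetical trigram context
    datastore.close()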
def load_affiliation():
    anno_name = 'affiliation'
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sents = datastore.get_annotation(doc_id, 'sentence')
        for i, anno in enumerate(datastore.get_annotation(doc_id, anno_name)):
            # Convert into the data structure to be registered into Solr
            sent = find_x_including_y(sents, anno)
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': anno_name,
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                anno_name + '_txt_ja': text[anno['begin']:anno['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Register the data into Solr
    indexer.load('anno', data)
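# A minimal driver sketch for load_affiliation, assuming the module imports
# used by the neighboring listings (json, solrindexer as indexer,
# sqlitedatastore as datastore, annoutil.find_x_including_y):
if __name__ == '__main__':
    datastore.connect()
    load_affiliation()
    datastore.close()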
def create_index_data(doc_id, meta_info, anno_name, anno, i, sent, text):
    ref_anno_name, link = anno['link']
    ref_anno = datastore.get_annotation(doc_id, ref_anno_name)[link]
    data = {
        'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
        'doc_id_i': doc_id,
        'anno_id_i': i,
        'name_s': anno_name,
        'sentence_txt_ja': text[sent['begin']:sent['end']],
        anno_name + '_txt_ja': text[anno['begin']:anno['end']],
        ref_anno_name + '_txt_ja': text[ref_anno['begin']:ref_anno['end']],
        'title_txt_ja': meta_info['title'],
        'url_s': meta_info['url'],
    }
    return data
def get():
    title = bottle.request.params.title.strip()
    keywords = bottle.request.params.keywords.split()
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('title_txt_ja', [[title]]),
        ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']]),
    ], rows=1000)
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        features = sentimentclassifier.convert_into_features_using_vocab(
            [(r['doc_id_i'], sent, tokens)], vocab)
        predicteds = mlclassifier.classify(features, model)
        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])
    return json.dumps(results, ensure_ascii=False)
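# A client-side sketch of calling this endpoint once it is mounted. The URL,
# route, and query values are assumptions; the handler does not fix them.
import requests

res = requests.get('http://localhost:8080/sentiment',
                   params={'title': '経済', 'keywords': '成長 景気'})
print(res.json())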
def create_annotation(doc_id, ptn):
    row = datastore.get(doc_id, fl=['content'])
    text = row['content']
    annos = []
    for chunk in datastore.get_annotation(doc_id, 'chunk'):
        chunk_str = text[chunk['begin']:chunk['end']]
        m = ptn.search(chunk_str)
        if not m:
            continue
        anno = {
            'begin': chunk['begin'] + m.start(),
            'end': chunk['begin'] + m.end(),
        }
        print(text[anno['begin']:anno['end']])
        annos.append(anno)
    return annos
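# A minimal usage sketch: compile a pattern and collect the matching spans per
# document. The regex (yen amounts) is a hypothetical example, not from the
# listing, and what is done with annos afterwards is left open.
import re

if __name__ == '__main__':
    datastore.connect()
    ptn = re.compile(r'[0-9,]+円')
    for doc_id in datastore.get_all_ids(limit=-1):
        annos = create_annotation(doc_id, ptn)
    datastore.close()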
def load_sentence():
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        for i, sent in enumerate(datastore.get_annotation(doc_id, 'sentence')):
            # Convert into the data structure to be registered into Solr
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, 'sentence', i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': 'sentence',
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Register the data into Solr
    indexer.load('anno', data)
import time

import mlclassifier
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    # Load the labeled data
    sentences = []
    labels = []
    with open('./data/labels.txt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            d = line.rstrip().split()
            label, doc_id, sent_id = int(d[0]), int(d[1]), int(d[2])
            sent = datastore.get_annotation(doc_id, 'sentence')[sent_id]
            tokens = find_xs_in_y(datastore.get_annotation(doc_id, 'token'), sent)
            sentences.append((doc_id, sent, tokens))
            labels.append(label)
    # Generate features for the training data
    num_train = int(len(sentences) * 0.8)
    sentences_train = sentences[:num_train]
    labels_train = labels[:num_train]
    features, vocab = mlclassifier.convert_into_features(sentences_train)
    # Train
    time_s = time.time()
    print(':::TRAIN START')
    model = mlclassifier.train(labels_train, features)
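    # The listing stops right after training. A hedged continuation that
    # evaluates on the held-out 20% might look like this;
    # convert_into_features_using_vocab and classify are the mlclassifier
    # functions used in the other listings, and the accuracy computation
    # itself is an assumption.
    print(':::TRAIN END', time.time() - time_s)
    sentences_test = sentences[num_train:]
    labels_test = labels[num_train:]
    features_test = mlclassifier.convert_into_features_using_vocab(
        sentences_test, vocab)
    predicteds = mlclassifier.classify(features_test, model)
    n_correct = sum(1 for p, l in zip(predicteds, labels_test) if int(p) == l)
    print('accuracy:', n_correct / len(labels_test))
    datastore.close()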
import time

import mlclassifier
import sentimentclassifier
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    # Load the labeled data
    sentences = []
    labels = []
    with open('data/labels_sentiment.txt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            d = line.rstrip().split()
            label, doc_id, sent_id = int(d[0]), int(d[1]), int(d[2])
            sent = datastore.get_annotation(doc_id, 'sentence')[sent_id]
            tokens = find_xs_in_y(
                datastore.get_annotation(doc_id, 'token'), sent)
            sentences.append((doc_id, sent, tokens))
            labels.append(label)
    # Generate features for the training data
    num_train = int(len(sentences) * 0.8)
    sentences_train = sentences[:num_train]
    labels_train = labels[:num_train]
    features, vocab = sentimentclassifier.convert_into_features(sentences_train)
    # Train (mlclassifier.train is reused for the sentiment features)
    time_s = time.time()
    print(':::TRAIN START')
    model = mlclassifier.train(labels_train, features)
import itertools
import logging
import math

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

from annoutil import find_xs_in_y
import sqlitedatastore as datastore

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':
    datastore.connect()
    sentences = []
    for doc_id in datastore.get_all_ids(limit=-1):
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sentences.append(
                [token['lemma'] for token in tokens if token.get('NE') == 'O'])
    # Concatenate every n_sent sentences into one pseudo-document
    n_sent = 20
    docs = [
        list(itertools.chain.from_iterable(sentences[i:i + n_sent]))
        for i in range(0, len(sentences), n_sent)
    ]
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=2, no_above=0.3)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
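    # The listing breaks off after building the corpus, leaving the imported
    # LdaModel unused. A hedged continuation (num_topics and the topic dump
    # are assumptions) might be:
    lda = LdaModel(corpus, num_topics=10, id2word=dictionary)
    for topic in lda.show_topics(num_topics=10):
        print(topic)
    datastore.close()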
import sqlitedatastore as datastore
import triematcher as matcher
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    dic_positive, dic_negative = matcher.get_sentiment_dictionaries()
    doc_id = 1
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        tokens = find_xs_in_y(datastore.get_annotation(doc_id, 'token'), sent)
        text = ''.join([token['lemma'] for token in tokens])
        print(text, '-->')
        print('\tpositive:', matcher.search_terms(text, dic_positive))
        print('\tnegative:', matcher.search_terms(text, dic_negative))
    datastore.close()
import json

import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_x_including_y


def create_index_data(doc_id, meta_info, anno_name, anno, i, sent, text):
    ref_anno_name, link = anno['link']
    ref_anno = datastore.get_annotation(doc_id, ref_anno_name)[link]
    data = {
        'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
        'doc_id_i': doc_id,
        'anno_id_i': i,
        'name_s': anno_name,
        'sentence_txt_ja': text[sent['begin']:sent['end']],
        anno_name + '_txt_ja': text[anno['begin']:anno['end']],
        ref_anno_name + '_txt_ja': text[ref_anno['begin']:ref_anno['end']],
        'title_txt_ja': meta_info['title'],
        'url_s': meta_info['url'],
    }
    return data


if __name__ == '__main__':
    datastore.connect()
    anno_name = 'cause'
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sents = datastore.get_annotation(doc_id, 'sentence')
        for i, anno in enumerate(datastore.get_annotation(doc_id, anno_name)):
            sent = find_x_including_y(sents, anno)
            data.append(create_index_data(
                doc_id, meta_info, anno_name, anno, i, sent, text))
    # Register the data into Solr
    indexer.load('anno', data)
    datastore.close()
import json

from sklearn.feature_extraction.text import TfidfVectorizer

import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    data = []
    doc_ids = []
    for doc_id in datastore.get_all_ids(limit=-1):
        data.append(' '.join([
            token['lemma']
            for token in datastore.get_annotation(doc_id, 'token')
        ]))
        doc_ids.append(doc_id)
    vectorizer = TfidfVectorizer(analyzer='word', max_df=0.9)
    vecs = vectorizer.fit_transform(data)
    for doc_id, vec in zip(doc_ids, vecs.toarray()):
        meta_info = json.loads(
            datastore.get(doc_id, ['meta_info'])['meta_info'])
        title = meta_info['title']
        print(doc_id, title)
        # Print the 10 terms with the highest tf-idf scores
        for w_id, tfidf in sorted(enumerate(vec),
                                  key=lambda x: x[1], reverse=True)[:10]:
            lemma = vectorizer.get_feature_names()[w_id]
            print('\t{0}\t{1:.4f}'.format(lemma, tfidf))
    datastore.close()
import sqlitedatastore as datastore
from annoutil import find_x_including_y, find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    anno_name = 'affiliation'
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        sentences = datastore.get_annotation(doc_id, 'sentence')
        tokens = datastore.get_annotation(doc_id, 'token')
        annos = datastore.get_annotation(doc_id, anno_name)
        for sentence in sentences:
            annos_in_sentence = find_xs_in_y(annos, sentence)
            if annos_in_sentence == []:
                continue
            prev = False
            for token in find_xs_in_y(tokens, sentence):
                if find_x_including_y(annos_in_sentence, token) is None:
                    prev = False
                    print('{0}\t{1}\t{2}'.format(
                        text[token['begin']:token['end']], token['POS'], 'O'))
                else:
                    if prev:
                        print('{0}\t{1}\tI-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    else:
                        print('{0}\t{1}\tB-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    prev = True
    datastore.close()
import sqlitedatastore as datastore
import solrindexer as indexer

if __name__ == '__main__':
    datastore.connect()
    print('#label', 'doc_id', 'sentence_id', 'text')
    results = indexer.search_annotation(
        fl_keyword_pairs=[
            ('sentence_txt_ja', [['教育', '治安', '経済']]),
            ('name_s', [['sentence']]),
        ],
        rows=1000)
    for r in results['response']['docs']:
        text = datastore.get(r['doc_id_i'], ['content'])['content']
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        # Convert into the label-file format
        print(0, r['doc_id_i'], r['anno_id_i'],
              text[sent['begin']:sent['end']])
    datastore.close()
#!/usr/bin/env python
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=3):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        print('tokens:')
        for token in datastore.get_annotation(doc_id, 'token'):
            print(' ', token['POS'], '\t', text[token['begin']:token['end']])
        print('chunks:')
        chunks = datastore.get_annotation(doc_id, 'chunk')
        for chunk in chunks:
            _, link = chunk['link']
            print(' ', text[chunk['begin']:chunk['end']])
            if link != -1:
                parent = chunks[link]
                print('\t-->', text[parent['begin']:parent['end']])
            else:
                print('\t-->', 'None')
        print('sentences:')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            print(' ', text[sent['begin']:sent['end']])
    datastore.close()
def extract_relation(doc_id):
    text = datastore.get(doc_id, fl=['content'])['content']
    all_chunks = datastore.get_annotation(doc_id, 'chunk')
    all_tokens = datastore.get_annotation(doc_id, 'token')
    anno_id = 0
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        chunks = find_xs_in_y(all_chunks, sent)
        tokens = find_xs_in_y(all_tokens, sent)
        for chunk in chunks:
            chunk_tokens = find_xs_in_y(tokens, chunk)
            if not any(chunk_token['lemma'] == '与える'
                       for chunk_token in chunk_tokens):
                continue
            affect, affect_tokens = find_child(
                chunk, chunks, tokens, text, all_chunks,
                child_cond={'text': ['影響を']})
            if affect is None:
                continue
            cause, cause_tokens = find_child(
                chunk, chunks, tokens, text, all_chunks,
                child_cond={
                    'pos1': ['助詞'],
                    'lemma1': ['は', 'も', 'が'],
                    'pos2_ng': ['助詞'],
                })
            if cause is None:
                continue
            effect, effect_tokens = find_child(
                chunk, chunks, tokens, text, all_chunks,
                child_cond={
                    'pos1': ['助詞'],
                    'lemma1': ['に'],
                    'pos2_ng': ['助詞'],
                })
            if effect is None:
                continue
            cause = extend_phrase(cause, cause_tokens, tokens, all_chunks)
            effect = extend_phrase(effect, effect_tokens, tokens, all_chunks)
            relation = {
                'cause': {
                    'begin': cause['begin'],
                    'end': cause['end'],
                    'link': ('effect', anno_id),
                },
                'effect': {
                    'begin': effect['begin'],
                    'end': effect['end'],
                },
            }
            anno_id += 1
            yield sent, relation
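# A minimal usage sketch for the extract_relation generator: print each
# cause/effect pair it yields. Iterating over all documents is an assumption,
# and find_child / extend_phrase are defined elsewhere in the source.
if __name__ == '__main__':
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=-1):
        text = datastore.get(doc_id, fl=['content'])['content']
        for sent, relation in extract_relation(doc_id):
            cause, effect = relation['cause'], relation['effect']
            print(text[cause['begin']:cause['end']], '-->',
                  text[effect['begin']:effect['end']])
    datastore.close()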
import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('name_s', [['sentence']]),
    ], rows=3000)
    sentences = []
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(
            datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        sentences.append((r['doc_id_i'], sent, tokens))
    # Get the rule
    rule = ruleclassifier.get_rule()
    # Classify
    features = ruleclassifier.convert_into_features_using_rules(sentences, rule)
    predicteds = ruleclassifier.classify(features, rule)
    for predicted, (doc_id, sent, tokens) in zip(predicteds, sentences):
        if predicted == 1:
            text = datastore.get(doc_id, ['content'])['content']
            print(predicted, text[sent['begin']:sent['end']])
    datastore.close()
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    anno_name = 'affiliation'
    for doc_id in datastore.get_all_ids(limit=-1):
        text = datastore.get(doc_id, fl=['content'])['content']
        with open('result/brat/{0}.txt'.format(doc_id), 'w') as f:
            f.write(text)
        with open('result/brat/{0}.ann'.format(doc_id), 'w') as f:
            for i, anno in enumerate(
                    datastore.get_annotation(doc_id, anno_name)):
                # brat entity IDs conventionally start at T1
                f.write('T{0}\t{1} {2} {3}\t{4}\n'.format(
                    i + 1, 'affiliation', anno['begin'], anno['end'],
                    text[anno['begin']:anno['end']]))
    datastore.close()
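# For reference, each line written to the .ann file follows the brat standoff
# format: an ID, the type with begin/end offsets, then the covered text,
# separated by tabs. A hypothetical example line:
#
# T1	affiliation 10 25	東京大学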