def find_matches(data):
    # Look for matches in all sentences
    send('Find matches request received')
    send('Loading pattern')
    pattern_id = data['pattern_id']
    # Load the role pattern instance
    role_pattern = db.load_role_pattern(pattern_id)
    send('Finding matches')
    # Init a minimal vocab to save on deserialisation and memory
    vocab = util.init_vocab()
    sentence_ids = db.get_ids('sentences')
    match_ids = []
    for sentence_id in sentence_ids:
        doc = db.load_sentence_doc(sentence_id, vocab)
        for token in doc:
            # Debug output: inspect each token and its custom valence attribute
            print(token, token._.valence)
        matches = role_pattern.match(doc)
        for match in matches:
            slots, match_tokens = db.despacify_match(match, sentence_id)
            match_row = {
                'sentence_id': sentence_id,
                'data': json.dumps({
                    'slots': slots,
                    'match_tokens': match_tokens,
                }),
            }
            match_id = db.insert_row('matches', match_row)
            match_ids.append(match_id)
            pattern_match_row = {
                'match_id': match_id,
                'pattern_id': pattern_id,
            }
            db.insert_row('pattern_matches', pattern_match_row)
    send('Matches saved. IDs: {}'.format(match_ids))
    emit('find_matches_success')
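# Illustrative sketch only (not the project's implementation): judging by the
# comment above, util.init_vocab presumably returns a minimal spaCy Vocab so
# stored Docs can be deserialised without loading a full language model. One
# way that could look, including registration of the custom `valence` token
# extension that find_matches reads, is:
#
#     from spacy.tokens import Token
#     from spacy.vocab import Vocab
#
#     def init_vocab():
#         if not Token.has_extension('valence'):
#             Token.set_extension('valence', default=None)
#         return Vocab()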
# Default feature set used when the request does not supply one
DEFAULT_REFINE_PATTERN_FEATURE_DICT = {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'}


def refine_pattern(data):
    send('refine pattern request received')
    send('Loading pattern')
    pattern_id = data['pattern_id']
    feature_dict = data.get('feature_dict')
    if not feature_dict:
        feature_dict = DEFAULT_REFINE_PATTERN_FEATURE_DICT
    role_pattern = db.load_role_pattern(pattern_id)
    send('Loading matches')
    pos_match_id = data['pos_match_id']
    neg_match_ids = data['neg_match_ids']
    pos_match_row = db.fetch_row('matches', pos_match_id, return_type='dict')
    if not pos_match_row:
        emit('error', 'no row found for pos match id: {}'.format(pos_match_id))
        return
    neg_match_rows = db.fetch_rows('matches', neg_match_ids, return_type='dict')
    for id_, row in zip(neg_match_ids, neg_match_rows):
        if not row:
            emit('error', 'no row found for neg match id: {}'.format(id_))
            return
    send('preparing training data')
    pos_match_sentence_id = pos_match_row['sentence_id']
    pos_match = json.loads(pos_match_row['data'])
    pos_match = db.spacify_match(pos_match, pos_match_sentence_id)
    neg_matches = []
    for neg_match_row in neg_match_rows:
        sentence_id = neg_match_row['sentence_id']
        neg_match = json.loads(neg_match_row['data'])
        neg_match = db.spacify_match(neg_match, sentence_id)
        neg_matches.append(neg_match)
    send('calculating pattern')
    role_pattern_builder = RolePatternBuilder(feature_dict)
    role_pattern_variants = role_pattern_builder.refine(role_pattern, pos_match, neg_matches)
    role_pattern_variants = list(role_pattern_variants)
    try:
        # Take the first variant that meets the criteria
        refined_pattern = role_pattern_variants[0]
    except IndexError:
        # No variants met the criteria
        refined_pattern = None
    if refined_pattern:
        send('success. saving pattern')
        pattern_row = {
            'name': 'unnamed_pattern',
            'role_pattern_instance': pickle.dumps(refined_pattern),
        }
        pattern_id = db.insert_row('patterns', pattern_row)
        send('pattern saved: {}'.format(pattern_id))
    else:
        send('pattern refinement unsuccessful')
    emit('refine_pattern_success')
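# Example refine_pattern request payload (illustrative values only; the keys
# mirror what the handler reads from `data`, and 'feature_dict' is optional,
# falling back to DEFAULT_REFINE_PATTERN_FEATURE_DICT):
#
#     {
#         'pattern_id': 1,
#         'pos_match_id': 10,
#         'neg_match_ids': [11, 12],
#         'feature_dict': {'DEP': 'dep_', 'LOWER': 'lower_'},
#     }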
def visualise_pattern(data):
    pattern_id = data['pattern_id']
    send('Loading pattern')
    role_pattern = db.load_role_pattern(pattern_id)
    pprint(role_pattern.spacy_dep_pattern)
    send('Generating DOT')
    node_attrs = role_pattern_vis.DEFAULT_NODE_ATTRS
    # for token in doc:
    #     token._.plot.update(node_attrs)
    #     token._.plot['label'] = '{0} [{1}]\n({2})'.format(token.orth_, token.i, token.tag_)
    graph, legend = role_pattern.to_pydot(legend=True)
    graph, legend = graph.to_string(), legend.to_string()
    dot_data = {
        'graph': graph,
        'legend': legend,
    }
    emit('visualise_pattern_success', dot_data)
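# Illustrative sketch only: the handler above emits raw DOT strings, so a
# receiving client (or a quick local test) could render them with pydot, the
# same library used to generate them. This helper is hypothetical and not part
# of the original module.
def render_dot_to_svg(dot_string, out_path):
    import pydot

    # graph_from_dot_data returns a list of graphs parsed from the DOT source
    (graph,) = pydot.graph_from_dot_data(dot_string)
    graph.write_svg(out_path)


# Example (hypothetical usage):
#     render_dot_to_svg(dot_data['graph'], 'pattern.svg')
#     render_dot_to_svg(dot_data['legend'], 'legend.svg')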
progress = util.read_progress()
n_patterns_to_insert = len(pattern_ids_to_insert)
for pattern_id in pattern_ids_to_insert:
    # Skip patterns that were already migrated in a previous run
    if pattern_id in progress['pattern_ids_inserted']:
        continue
    print('pattern_id', pattern_id)
    # Load the RolePattern, preferring the pickled file on disk and falling
    # back to the database
    role_pattern_path = os.path.join(config['patterns_output_dir'], '{}.p'.format(pattern_id))
    try:
        with open(role_pattern_path, 'rb') as f:
            role_pattern = pickle.load(f)
    except (OSError, pickle.UnpicklingError):
        role_pattern = db.load_role_pattern(pattern_id)
    token_labels = role_pattern.token_labels
    role_pattern_bytes = pickle.dumps(role_pattern)
    pattern_row = {
        'id': pattern_id,
        'role_pattern_instance': role_pattern_bytes,
        'data': json.dumps({'token_labels': token_labels}),
    }
    inserted_id = db.insert_row('patterns', pattern_row, db_path=config['new_db_file_path'])
    progress['pattern_ids_inserted'].append(inserted_id)
    print(len(progress['pattern_ids_inserted']), '/', n_patterns_to_insert)
    util.write_progress(progress)
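# Illustrative sketch only: util.read_progress / util.write_progress are not
# shown here. The loop above implies they persist which pattern ids have been
# inserted so the migration can resume after an interruption. A minimal
# JSON-file version, assuming a hypothetical 'progress.json' path, might be:
#
#     import json
#     import os
#
#     PROGRESS_PATH = 'progress.json'
#
#     def read_progress():
#         if not os.path.exists(PROGRESS_PATH):
#             return {'pattern_ids_inserted': []}
#         with open(PROGRESS_PATH) as f:
#             return json.load(f)
#
#     def write_progress(progress):
#         with open(PROGRESS_PATH, 'w') as f:
#             json.dump(progress, f)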