def add_links_from_file(filename, link_type, file_type, config_file,
                        is_to_print_time, is_to_add_several_lexemes,
                        is_dry_run, revset_id=None, comment=""):
    """Parse links from a file and add them through an AnnotationEditor.

    filename, link_type and file_type are forwarded to
    parse_links_from_file(); config_file is used to build the
    AnnotationEditor.  revset_id, the comment and is_to_add_several_lexemes
    are forwarded to add_links().  When comment is the empty string, the
    base name of filename is used instead.

    The editor is committed unless is_dry_run is true.  When
    is_to_print_time is true, the elapsed wall-clock time is printed.
    """
    started_at = datetime.datetime.now()

    editor = AnnotationEditor(config_file)
    links = parse_links_from_file(filename, link_type, file_type)

    # An empty comment falls back to the bare file name.
    effective_comment = os.path.basename(filename) if comment == "" else comment

    add_links(editor, links, revset_id, effective_comment,
              is_to_add_several_lexemes)

    if not is_dry_run:
        editor.commit()

    if is_to_print_time:
        elapsed = datetime.datetime.now() - started_at
        print('time elapsed for add_links_from_file:{0}'.format(elapsed))
def add_links_from_file(filename, link_type, file_type, config_file,
                        is_to_print_time, is_to_add_several_lexemes,
                        is_dry_run, revset_id=None, comment=""):
    """Read links from `filename` and add them via an AnnotationEditor.

    The links are obtained from parse_links_from_file(filename, link_type,
    file_type) and handed to add_links() together with revset_id, the
    comment and is_to_add_several_lexemes.  An empty comment is replaced by
    the base name of `filename`.

    Nothing is committed when is_dry_run is true.  The elapsed time is
    printed when is_to_print_time is true.
    """
    time_before = datetime.datetime.now()
    annotation_editor = AnnotationEditor(config_file)
    parsed_links = parse_links_from_file(filename, link_type, file_type)

    # Default the comment to the file's base name.
    if comment == "":
        comment = os.path.basename(filename)

    add_links(
        annotation_editor,
        parsed_links,
        revset_id,
        comment,
        is_to_add_several_lexemes,
    )

    if is_dry_run:
        pass  # dry run: leave the changes uncommitted
    else:
        annotation_editor.commit()

    if is_to_print_time:
        time_after = datetime.datetime.now()
        print('time elapsed for add_links_from_file:{0}'.format(
            time_after - time_before))
"SELECT complex.parent_gid, complex.child_gid, COUNT(child_gid) as children FROM anaphora_syntax_groups AS comp_g INNER JOIN anaphora_syntax_groups_complex AS complex ON complex.parent_gid = comp_g.group_id INNER JOIN anaphora_syntax_groups AS simp_g ON simp_g.group_id = complex.child_gid INNER JOIN anaphora_syntax_groups_simple ON anaphora_syntax_groups_simple.group_id = complex.child_gid WHERE comp_g.head_id = 0 AND comp_g.group_type = 10 AND simp_g.group_type = 1 GROUP BY parent_gid HAVING children = 2" ) supplement = editor.db_cursor.fetchall() for row in supplement: editor.db_cursor.execute( "SELECT MIN (child_gid) as first_group FROM supplement WHERE parent_gid = " + row["parent_gid"] ) first_base = editor.db_cursor.fetchone() editor.db_cursor.execute( "UPDATE anaphora_syntax_groups SET head_id = " + str(first_base["first_group"]) + " WHERE group_id = " + str(row["parent_gid"]) ) # все оставшиеся сложные группы: предложные, собственные наименования, несобственные наименования, приложения, именные - плохие группы editor.db_cursor.execute( "SELECT anaphora_syntax_groups_complex.parent_gid, anaphora_syntax_groups.group_type FROM anaphora_syntax_groups_complex INNER JOIN anaphora_syntax_groups ON anaphora_syntax_groups_complex.parent_gid = anaphora_syntax_groups.group_id WHERE head_id = 0 AND group_type IN (8, 9, 10, 11, 13)" ) bad_complex_groups = editor.db_cursor.fetchall() for row in bad_complex_groups: editor.db_cursor.execute( "UPDATE anaphora_syntax_groups SET marks = 'bad' WHERE group_id = " + str(row["parent_gid"]) ) editor.commit()
# In a complex improper name that contains exactly one base group, make that
# base group the head.
editor.db_cursor.execute(
    "SELECT complex.parent_gid, complex.child_gid, "
    "anaphora_syntax_groups_simple.token_id, COUNT(child_gid) as children "
    "FROM anaphora_syntax_groups AS comp_g "
    "INNER JOIN anaphora_syntax_groups_complex AS complex "
    "ON complex.parent_gid = comp_g.group_id "
    "INNER JOIN anaphora_syntax_groups AS simp_g "
    "ON simp_g.group_id = complex.child_gid "
    "INNER JOIN anaphora_syntax_groups_simple "
    "ON anaphora_syntax_groups_simple.group_id = complex.child_gid "
    "INNER JOIN tf_revisions "
    "ON tf_revisions.tf_id = anaphora_syntax_groups_simple.token_id "
    "WHERE tf_revisions.is_last = 1 AND comp_g.head_id = 0 "
    "AND comp_g.group_type = 9 AND simp_g.group_type = 1 "
    "GROUP BY parent_gid HAVING children = 1"
)
base_in_name = editor.db_cursor.fetchall()
for row in base_in_name:
    editor.db_cursor.execute(
        "UPDATE anaphora_syntax_groups SET head_id = " + str(row['child_gid'])
        + " WHERE group_id = " + str(row['parent_gid'])
    )

# In appositions with exactly 2 base groups, make the first base group the
# head.  "First" is the smallest child_gid; it is computed by the grouped
# query itself, because the fetched rows live only in the Python variable
# `supplement` and cannot be queried with SQL afterwards.
editor.db_cursor.execute(
    "SELECT complex.parent_gid, MIN(complex.child_gid) AS first_group, "
    "COUNT(child_gid) as children "
    "FROM anaphora_syntax_groups AS comp_g "
    "INNER JOIN anaphora_syntax_groups_complex AS complex "
    "ON complex.parent_gid = comp_g.group_id "
    "INNER JOIN anaphora_syntax_groups AS simp_g "
    "ON simp_g.group_id = complex.child_gid "
    "INNER JOIN anaphora_syntax_groups_simple "
    "ON anaphora_syntax_groups_simple.group_id = complex.child_gid "
    "WHERE comp_g.head_id = 0 AND comp_g.group_type = 10 "
    "AND simp_g.group_type = 1 "
    "GROUP BY parent_gid HAVING children = 2"
)
supplement = editor.db_cursor.fetchall()
for row in supplement:
    editor.db_cursor.execute(
        "UPDATE anaphora_syntax_groups SET head_id = " + str(row['first_group'])
        + " WHERE group_id = " + str(row['parent_gid'])
    )

# All remaining headless complex groups (prepositional, proper names,
# improper names, appositions, nominal) are marked as bad groups.
editor.db_cursor.execute(
    "SELECT anaphora_syntax_groups_complex.parent_gid, "
    "anaphora_syntax_groups.group_type "
    "FROM anaphora_syntax_groups_complex "
    "INNER JOIN anaphora_syntax_groups "
    "ON anaphora_syntax_groups_complex.parent_gid = anaphora_syntax_groups.group_id "
    "WHERE head_id = 0 AND group_type IN (8, 9, 10, 11, 13)"
)
bad_complex_groups = editor.db_cursor.fetchall()
for row in bad_complex_groups:
    editor.db_cursor.execute(
        "UPDATE anaphora_syntax_groups SET marks = 'bad' WHERE group_id = "
        + str(row['parent_gid'])
    )

editor.commit()
def main():
    """Run update_annotation() on a fresh editor and commit the result.

    The commit is skipped when 'debug' is one of the command-line arguments.
    """
    annotation_editor = AnnotationEditor(CONFIG_PATH)
    update_annotation(annotation_editor)

    debug_requested = 'debug' in sys.argv
    if debug_requested:
        return
    annotation_editor.commit()
def main():
    """Process the file named by the first command-line argument and commit."""
    config_path = '/corpus/config.ini'
    annotation_editor = AnnotationEditor(config_path)

    input_path = sys.argv[1]
    process(input_path, annotation_editor)

    annotation_editor.commit()
def main():
    """Call merge() with the first two command-line arguments and commit."""
    annotation_editor = AnnotationEditor(CONFIG_PATH)
    cursor = annotation_editor.db_cursor

    first_arg = sys.argv[1]
    second_arg = sys.argv[2]
    merge(cursor, first_arg, second_arg)

    annotation_editor.commit()
def main():
    """Print a tab-separated report of every named-entity annotation.

    Loads all rows of ``ne_entities`` together with the annotator id, the
    sentence id and the space-separated tag ids, then looks up user names,
    the tokens of the sentences involved and the tag names.  One UTF-8
    encoded line is printed per entity with these fields, each followed by
    a tab: entity id, sentence id, update time, user name, tag names, the
    entity's own tokens, and all tokens of the sentence.

    The editor is committed at the end unless 'debug' is one of the
    command-line arguments.
    """
    editor = AnnotationEditor(CONFIG_PATH)
    dbh = editor.db_cursor

    # all entity entries
    dbh.execute("""
        SELECT ne_entities.*, user_id, sent_id,
            GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags
        FROM ne_entities
        LEFT JOIN ne_paragraphs USING (annot_id)
        LEFT JOIN tokens ON start_token = tf_id
        LEFT JOIN ne_entity_tags USING (entity_id)
        GROUP BY entity_id
        ORDER BY sent_id, start_token, updated_ts
    """)
    results = dbh.fetchall()

    # Two separate sets: a chained `a = b = set()` would make both names
    # refer to one set and mix user ids with sentence ids.
    user_ids = set()
    sent_ids = set()
    for row in results:
        user_ids.add(str(row['user_id']))
        sent_ids.add(str(row['sent_id']))

    # collect users separately (skipped when there are no ids, because
    # an empty "IN ()" list is not valid SQL)
    users = {}
    if user_ids:
        dbh.execute("""
            SELECT user_id, user_name, user_shown_name
            FROM users
            WHERE user_id IN ({0})
        """.format(", ".join(user_ids)))
        for user in dbh.fetchall():
            # prefer the shown name, fall back to the login name
            users[user["user_id"]] = user["user_shown_name"] or user["user_name"]

    # collect all tokens from required sentences, grouped by sentence
    sentences = {}
    if sent_ids:
        dbh.execute("""
            SELECT *
            FROM tokens
            WHERE sent_id IN ({0})
            ORDER BY sent_id, pos
        """.format(", ".join(sent_ids)))
        for token in dbh.fetchall():
            sentences.setdefault(token["sent_id"], []).append(token)

    # collect tag names
    dbh.execute("SELECT * FROM ne_tags")
    tags = {}
    for tag in dbh.fetchall():
        tags[tag["tag_id"]] = tag["tag_name"]

    # output
    for row in results:
        # GROUP_CONCAT yields NULL for an entity without tags, so treat a
        # missing value as an empty tag list.
        tag_names = "".join(
            tags[int(tag)] + " " for tag in (row["tags"] or "").split())

        ne_tokens = []
        all_tokens = []
        ne_len = 0
        for tkn in sentences[row["sent_id"]]:
            txt = tkn["tf_text"] + " "
            # these are ne tokens: the start token and the ones following
            # it, up to the entity length
            if tkn["tf_id"] == row["start_token"] or 0 < ne_len < row["length"]:
                ne_tokens.append(txt)
                ne_len += 1
            # all tokens for context
            all_tokens.append(txt)

        fields = [
            str(row["entity_id"]),
            str(row["sent_id"]),
            datetime.fromtimestamp(row["updated_ts"]).strftime("%b %d, %H:%M"),
            users[row["user_id"]],
            tag_names,
            "".join(ne_tokens),
            "".join(all_tokens),
        ]
        # every field, including the last one, is followed by a tab
        out = "\t".join(fields) + "\t"
        print(out.encode('UTF-8'))

    if 'debug' not in sys.argv:
        editor.commit()
def main():
    """Print a tab-separated report of every named-entity annotation.

    Loads all rows of ``ne_entities`` together with the annotator id, the
    sentence id and the space-separated tag ids, then looks up user names,
    the tokens of the sentences involved and the tag names.  One UTF-8
    encoded line is printed per entity with these fields, each followed by
    a tab: entity id, sentence id, update time, user name, tag names, the
    entity's own tokens, and all tokens of the sentence.

    The editor is committed at the end unless 'debug' is one of the
    command-line arguments.
    """
    editor = AnnotationEditor(CONFIG_PATH)
    dbh = editor.db_cursor

    # all entity entries
    dbh.execute("""
        SELECT ne_entities.*, user_id, sent_id,
            GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags
        FROM ne_entities
        LEFT JOIN ne_paragraphs USING (annot_id)
        LEFT JOIN tokens ON start_token = tf_id
        LEFT JOIN ne_entity_tags USING (entity_id)
        GROUP BY entity_id
        ORDER BY sent_id, start_token, updated_ts
    """)
    results = dbh.fetchall()

    # Two separate sets: a chained `a = b = set()` would make both names
    # refer to one set and mix user ids with sentence ids.
    user_ids = set()
    sent_ids = set()
    for row in results:
        user_ids.add(str(row['user_id']))
        sent_ids.add(str(row['sent_id']))

    # collect users separately (skipped when there are no ids, because
    # an empty "IN ()" list is not valid SQL)
    users = {}
    if user_ids:
        dbh.execute("""
            SELECT user_id, user_name, user_shown_name
            FROM users
            WHERE user_id IN ({0})
        """.format(", ".join(user_ids)))
        for user in dbh.fetchall():
            # prefer the shown name, fall back to the login name
            users[user["user_id"]] = user["user_shown_name"] or user["user_name"]

    # collect all tokens from required sentences, grouped by sentence
    sentences = {}
    if sent_ids:
        dbh.execute("""
            SELECT *
            FROM tokens
            WHERE sent_id IN ({0})
            ORDER BY sent_id, pos
        """.format(", ".join(sent_ids)))
        for token in dbh.fetchall():
            sentences.setdefault(token["sent_id"], []).append(token)

    # collect tag names
    dbh.execute("SELECT * FROM ne_tags")
    tags = {}
    for tag in dbh.fetchall():
        tags[tag["tag_id"]] = tag["tag_name"]

    # output
    for row in results:
        # GROUP_CONCAT yields NULL for an entity without tags, so treat a
        # missing value as an empty tag list.
        tag_names = "".join(
            tags[int(tag)] + " " for tag in (row["tags"] or "").split())

        ne_tokens = []
        all_tokens = []
        ne_len = 0
        for tkn in sentences[row["sent_id"]]:
            txt = tkn["tf_text"] + " "
            # these are ne tokens: the start token and the ones following
            # it, up to the entity length
            if tkn["tf_id"] == row["start_token"] or 0 < ne_len < row["length"]:
                ne_tokens.append(txt)
                ne_len += 1
            # all tokens for context
            all_tokens.append(txt)

        fields = [
            str(row["entity_id"]),
            str(row["sent_id"]),
            datetime.fromtimestamp(row["updated_ts"]).strftime("%b %d, %H:%M"),
            users[row["user_id"]],
            tag_names,
            "".join(ne_tokens),
            "".join(all_tokens),
        ]
        # every field, including the last one, is followed by a tab
        out = "\t".join(fields) + "\t"
        print(out.encode('UTF-8'))

    if 'debug' not in sys.argv:
        editor.commit()
def main():
    """Merge using the two command-line arguments, then commit the editor."""
    annotation_editor = AnnotationEditor(CONFIG_PATH)
    merge_args = (sys.argv[1], sys.argv[2])
    merge(annotation_editor.db_cursor, *merge_args)
    annotation_editor.commit()