def add_links_from_file(filename, link_type, file_type, config_file, is_to_print_time, is_to_add_several_lexemes, is_dry_run, revset_id=None, comment=""):
    """Parse annotation links from *filename* and store them in the corpus.

    An AnnotationEditor is built from *config_file*; links of *link_type*
    are read from the file (format chosen by *file_type*) and added via
    add_links().  When *comment* is empty the file's basename is used
    instead.  Nothing is committed if *is_dry_run* is truthy; elapsed
    wall-clock time is printed when *is_to_print_time* is truthy.
    """
    started_at = datetime.datetime.now()
    editor = AnnotationEditor(config_file)
    links = parse_links_from_file(filename, link_type, file_type)
    if comment == "":
        comment = os.path.basename(filename)
    add_links(editor, links, revset_id, comment, is_to_add_several_lexemes)
    if not is_dry_run:
        editor.commit()
    if is_to_print_time:
        print('time elapsed for add_links_from_file:{0}'.format(datetime.datetime.now() - started_at))
def add_links_from_file(filename, link_type, file_type, config_file, is_to_print_time, is_to_add_several_lexemes, is_dry_run, revset_id=None, comment=""):
    """Load links of *link_type* from *filename* and add them to the corpus.

    The parsed links are written through an AnnotationEditor built from
    *config_file*.  An empty *comment* defaults to the input file's
    basename.  The transaction is committed only when *is_dry_run* is
    falsy; when *is_to_print_time* is truthy the elapsed time is printed.
    """
    t0 = datetime.datetime.now()
    annotation_editor = AnnotationEditor(config_file)
    parsed_links = parse_links_from_file(filename, link_type, file_type)
    note = os.path.basename(filename) if comment == "" else comment
    add_links(annotation_editor, parsed_links, revset_id, note, is_to_add_several_lexemes)
    if not is_dry_run:
        annotation_editor.commit()
    if is_to_print_time:
        elapsed = datetime.datetime.now() - t0
        print('time elapsed for add_links_from_file:{0}'.format(elapsed))
def main():
    """CLI entry point for the group export.

    argv[1] selects the export mode ('simple', 'complex' or 'both'); an
    optional argv[2] == 'mod' restricts the export to moderators' groups.
    Exits with status 1 and a usage message on bad arguments.
    """
    editor = AnnotationEditor(CONFIG_PATH)
    mode_is_valid = len(sys.argv) >= 2 and sys.argv[1] in ('simple', 'complex', 'both')
    if not mode_is_valid:
        sys.stderr.write(
            """Usage: {0} {{simple|complex|both}} [mod]\n\tmod: export only moderators' groups, otherwise first user's annotation for each text\n"""
            .format(sys.argv[0]))
        sys.exit(1)
    only_moderated = len(sys.argv) > 2 and sys.argv[2] == 'mod'
    do_export(editor.db_cursor, sys.argv[1], only_moderated)
def find_adv_comp_pairs(config_file):
    """Pair comparative lexemes with their positive-degree adverbs.

    For every lexeme whose lemma matches ENDING_COMPARATIVE (regex search
    with COMPARATIVE_GLOSS), the adverb stem is derived and looked up with
    ADVERB_GLOSS; each match yields a tuple of two {'id', 'text'} dicts:
    the comparative first, the positive adverb second.  (Python 2: lemma
    text arrives as a UTF-8 byte string and is decoded here.)
    """
    editor = AnnotationEditor(config_file)
    comparatives = editor.find_lexeme_by_lemma(
        u'%' + ENDING_COMPARATIVE, [COMPARATIVE_GLOSS], lemma_is_regex=True)
    pairs = []
    for comparative in comparatives:
        comparative_text = comparative.lemma['text'].decode('utf-8')
        stem = get_adverb_stem(comparative_text)
        for positive in editor.find_lexeme_by_lemma(stem, [ADVERB_GLOSS]):
            positive_text = positive.lemma['text'].decode('utf-8')
            pairs.append((
                {'id': comparative._id, 'text': comparative_text},
                {'id': positive._id, 'text': positive_text},
            ))
    return pairs
# -*- coding: utf-8 -*- import sys sys.path.append("/corpus/python") from Annotation import AnnotationEditor editor = AnnotationEditor("config.ini") editor.db_cursor.execute("SHOW columns FROM anaphora_syntax_groups") has_marks = False rows = editor.db_cursor.fetchall() # добавлять колонку "marks" только если её не существует for row in rows: if row["Field"] == "marks": has_marks = True break if not has_marks: editor.db_cursor.execute("ALTER TABLE anaphora_syntax_groups ADD marks ENUM('bad', 'suspicious', 'no head', 'all')") """ ============================== ПРОСТЫЕ ГРУППЫ ============================== """ # убрать вершины, добавить тэг "нет вершины" в вводных выражениях, сложных союзах, сложных предлогах, наречных выражениях
# -*- coding: utf-8 -*- import sys sys.path.append('/corpus/python') from Annotation import AnnotationEditor editor = AnnotationEditor('config.ini') editor.db_cursor.execute("SHOW columns FROM anaphora_syntax_groups") has_marks = False rows = editor.db_cursor.fetchall() # добавлять колонку "marks" только если её не существует for row in rows: if row['Field'] == 'marks': has_marks = True break if not has_marks: editor.db_cursor.execute("ALTER TABLE anaphora_syntax_groups ADD marks ENUM('bad', 'suspicious', 'no head', 'all')") """ ============================== ПРОСТЫЕ ГРУППЫ ============================== """ # убрать вершины, добавить тэг "нет вершины" в вводных выражениях, сложных союзах, сложных предлогах, наречных выражениях editor.db_cursor.execute("SELECT anaphora_syntax_groups_simple.group_id, group_type FROM anaphora_syntax_groups_simple INNER JOIN anaphora_syntax_groups ON anaphora_syntax_groups_simple.group_id = anaphora_syntax_groups.group_id WHERE group_type IN (4, 5, 6, 7)")
def main():
    """Entry point: run the export against the configured corpus database."""
    do_export(AnnotationEditor(CONFIG_PATH).db_cursor)
def main():
    """Update the annotation, committing unless 'debug' is on the CLI."""
    editor = AnnotationEditor(CONFIG_PATH)
    update_annotation(editor)
    if 'debug' in sys.argv:
        return  # dry run: leave the database untouched
    editor.commit()
def main():
    """Process the file named in argv[1] and persist the changes."""
    annotation_editor = AnnotationEditor('/corpus/config.ini')
    source_path = sys.argv[1]
    process(source_path, annotation_editor)
    annotation_editor.commit()
def main():
    """Merge the two entities given as argv[1]/argv[2], then commit."""
    editor = AnnotationEditor(CONFIG_PATH)
    first_id, second_id = sys.argv[1], sys.argv[2]
    merge(editor.db_cursor, first_id, second_id)
    editor.commit()
def main(): editor = AnnotationEditor(CONFIG_PATH) dbh = editor.db_cursor # all entity entries dbh.execute(""" SELECT ne_entities.*, user_id, sent_id, GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags FROM ne_entities LEFT JOIN ne_paragraphs USING (annot_id) LEFT JOIN tokens ON start_token = tf_id LEFT JOIN ne_entity_tags USING (entity_id) GROUP BY entity_id ORDER BY sent_id, start_token, updated_ts """) results = dbh.fetchall() user_ids = sent_ids = set() for row in results: user_ids.add(str(row['user_id'])) sent_ids.add(str(row['sent_id'])) # collect users separately dbh.execute(""" SELECT user_id, user_name, user_shown_name FROM users WHERE user_id IN ({0}) """.format(", ".join(user_ids))) users_res = dbh.fetchall() users = {} for user in users_res: if len(user["user_shown_name"]) > 0: user_name = user["user_shown_name"] else: user_name = user["user_name"] users[user["user_id"]] = user_name # collect all tokens from required sentences dbh.execute(""" SELECT * FROM tokens WHERE sent_id IN ({0}) ORDER BY sent_id, pos """.format(", ".join(sent_ids))) sent_res = dbh.fetchall() sentences = {} for sent in sent_res: if sent["sent_id"] not in sentences: sentences[sent["sent_id"]] = [] sentences[sent["sent_id"]].append(sent) # collect tag names dbh.execute("SELECT * FROM ne_tags") tags_res = dbh.fetchall() tags = {} for tag in tags_res: tags[tag["tag_id"]] = tag["tag_name"] # output for row in results: out = "" out += str(row["entity_id"]) + "\t" out += str(row["sent_id"]) + "\t" out += datetime.fromtimestamp( row["updated_ts"]).strftime("%b %d, %H:%M") + "\t" out += users[row["user_id"]] + "\t" for tag in row["tags"].split(): out += tags[int(tag)] + " " out += "\t" tokens_all = tokens = "" ne_len = 0 for tkn in sentences[row["sent_id"]]: txt = tkn["tf_text"] + " " # these are ne tokens if tkn["tf_id"] == row["start_token"] or (ne_len > 0 and ne_len < row["length"]): tokens += txt ne_len += 1 # all tokens for context tokens_all += txt out += tokens + "\t" out += 
tokens_all + "\t" print out.encode('UTF-8') if 'debug' not in sys.argv: editor.commit()
def main(): editor = AnnotationEditor(CONFIG_PATH) dbh = editor.db_cursor # all entity entries dbh.execute(""" SELECT ne_entities.*, user_id, sent_id, GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags FROM ne_entities LEFT JOIN ne_paragraphs USING (annot_id) LEFT JOIN tokens ON start_token = tf_id LEFT JOIN ne_entity_tags USING (entity_id) GROUP BY entity_id ORDER BY sent_id, start_token, updated_ts """) results = dbh.fetchall() user_ids = sent_ids = set() for row in results: user_ids.add(str(row['user_id'])) sent_ids.add(str(row['sent_id'])) # collect users separately dbh.execute(""" SELECT user_id, user_name, user_shown_name FROM users WHERE user_id IN ({0}) """.format(", ".join(user_ids))) users_res = dbh.fetchall() users = {} for user in users_res: if len(user["user_shown_name"]) > 0: user_name = user["user_shown_name"] else: user_name = user["user_name"] users[user["user_id"]] = user_name # collect all tokens from required sentences dbh.execute(""" SELECT * FROM tokens WHERE sent_id IN ({0}) ORDER BY sent_id, pos """.format(", ".join(sent_ids))) sent_res = dbh.fetchall() sentences = {} for sent in sent_res: if sent["sent_id"] not in sentences: sentences[sent["sent_id"]] = [] sentences[sent["sent_id"]].append(sent) # collect tag names dbh.execute("SELECT * FROM ne_tags") tags_res = dbh.fetchall() tags = {} for tag in tags_res: tags[tag["tag_id"]] = tag["tag_name"] # output for row in results: out = "" out += str(row["entity_id"]) + "\t" out += str(row["sent_id"]) + "\t" out += datetime.fromtimestamp(row["updated_ts"]).strftime("%b %d, %H:%M") + "\t" out += users[row["user_id"]] + "\t" for tag in row["tags"].split(): out += tags[int(tag)] + " " out += "\t" tokens_all = tokens = "" ne_len = 0 for tkn in sentences[row["sent_id"]]: txt = tkn["tf_text"] + " " # these are ne tokens if tkn["tf_id"] == row["start_token"] or (ne_len > 0 and ne_len < row["length"]): tokens += txt ne_len+=1 # all tokens for context tokens_all += txt out += tokens + "\t" out += 
tokens_all + "\t" print out.encode('UTF-8') if 'debug' not in sys.argv: editor.commit()
def main():
    """Feed standard input into the importer using the configured editor."""
    do_import(AnnotationEditor(CONFIG_PATH), sys.stdin)