Пример #1
0
def add_links_from_file(filename,
                        link_type,
                        file_type,
                        config_file,
                        is_to_print_time,
                        is_to_add_several_lexemes,
                        is_dry_run,
                        revset_id=None,
                        comment=""):
    """Parse links from a file and pass them to ``add_links``.

    Args:
        filename: Path of the file to parse; its base name is also used as
            the comment when ``comment`` is the empty string.
        link_type: Forwarded to ``parse_links_from_file``.
        file_type: Forwarded to ``parse_links_from_file``.
        config_file: Passed to the ``AnnotationEditor`` constructor.
        is_to_print_time: When true, print the elapsed wall-clock time.
        is_to_add_several_lexemes: Forwarded to ``add_links``.
        is_dry_run: When true, the final ``commit()`` is skipped
            (``add_links`` is still called).
        revset_id: Forwarded to ``add_links``.
        comment: Forwarded to ``add_links``; defaults to the file's base name.
    """
    started_at = datetime.datetime.now()

    editor = AnnotationEditor(config_file)
    links = parse_links_from_file(filename, link_type, file_type)

    # An empty comment falls back to the file's base name.
    effective_comment = (os.path.basename(filename)
                         if comment == "" else comment)
    add_links(editor, links, revset_id, effective_comment,
              is_to_add_several_lexemes)

    if not is_dry_run:
        editor.commit()

    if is_to_print_time:
        elapsed = datetime.datetime.now() - started_at
        print('time elapsed for add_links_from_file:{0}'.format(elapsed))
Пример #2
0
def add_links_from_file(filename, link_type, file_type, config_file, is_to_print_time, is_to_add_several_lexemes, is_dry_run, revset_id=None, comment=""):
    """Read links from ``filename`` and hand them to ``add_links``.

    The links come from ``parse_links_from_file(filename, link_type,
    file_type)`` and are added through an ``AnnotationEditor`` built from
    ``config_file``.  If ``comment`` is the empty string, the base name of
    ``filename`` is used instead.  The editor is committed unless
    ``is_dry_run`` is true, and the elapsed time is printed when
    ``is_to_print_time`` is true.
    """
    t0 = datetime.datetime.now()

    editor = AnnotationEditor(config_file)
    parsed_links = parse_links_from_file(filename, link_type, file_type)

    if comment == "":
        # No explicit comment given: label the change with the file name.
        comment = os.path.basename(filename)
    add_links(
        editor,
        parsed_links,
        revset_id,
        comment,
        is_to_add_several_lexemes,
    )

    if not is_dry_run:
        editor.commit()

    if not is_to_print_time:
        return
    time_spent = datetime.datetime.now() - t0
    print('time elapsed for add_links_from_file:{0}'.format(time_spent))
Пример #3
0
    "SELECT complex.parent_gid, complex.child_gid, COUNT(child_gid) as children FROM anaphora_syntax_groups AS comp_g INNER JOIN anaphora_syntax_groups_complex AS complex ON complex.parent_gid = comp_g.group_id INNER JOIN anaphora_syntax_groups AS simp_g ON simp_g.group_id = complex.child_gid INNER JOIN anaphora_syntax_groups_simple ON anaphora_syntax_groups_simple.group_id = complex.child_gid WHERE comp_g.head_id = 0 AND comp_g.group_type = 10 AND simp_g.group_type = 1 GROUP BY parent_gid HAVING children = 2"
)

# Rows produced by the preceding query: one row per parent group, with
# "parent_gid" and "child_gid" keys.
supplement = editor.db_cursor.fetchall()

for row in supplement:
    # parent_gid is wrapped in str() like in every other query of this
    # script; concatenating a non-string id to the SQL text would raise
    # TypeError.
    # NOTE(review): `supplement` is only a Python variable in the visible
    # code; no SQL table or view of that name is created here -- confirm the
    # intended table.  Also note the space in "MIN (": some SQL dialects
    # (e.g. MySQL without IGNORE_SPACE) reject it -- confirm.
    editor.db_cursor.execute(
        "SELECT MIN (child_gid) as first_group FROM supplement WHERE parent_gid = "
        + str(row["parent_gid"])
    )
    first_base = editor.db_cursor.fetchone()
    # Make the smallest child group id the head of the parent group.
    editor.db_cursor.execute(
        "UPDATE anaphora_syntax_groups SET head_id = "
        + str(first_base["first_group"])
        + " WHERE group_id = "
        + str(row["parent_gid"])
    )


# All remaining complex groups -- prepositional, proper names, non-proper
# names, appositions, nominal -- are bad groups.
editor.db_cursor.execute(
    "SELECT anaphora_syntax_groups_complex.parent_gid, anaphora_syntax_groups.group_type FROM anaphora_syntax_groups_complex INNER JOIN anaphora_syntax_groups ON anaphora_syntax_groups_complex.parent_gid = anaphora_syntax_groups.group_id WHERE head_id = 0 AND group_type IN (8, 9, 10, 11, 13)"
)

bad_complex_groups = editor.db_cursor.fetchall()
for row in bad_complex_groups:
    # Flag every still-headless group of the listed types.
    editor.db_cursor.execute(
        "UPDATE anaphora_syntax_groups SET marks = 'bad' WHERE group_id = " + str(row["parent_gid"])
    )

editor.commit()
Пример #4
0
# In a complex non-proper-name group that contains exactly one base group,
# make that base group the head.
editor.db_cursor.execute("SELECT complex.parent_gid, complex.child_gid, anaphora_syntax_groups_simple.token_id, COUNT(child_gid) as children FROM anaphora_syntax_groups AS comp_g INNER JOIN anaphora_syntax_groups_complex AS complex ON complex.parent_gid = comp_g.group_id INNER JOIN anaphora_syntax_groups AS simp_g ON simp_g.group_id = complex.child_gid INNER JOIN anaphora_syntax_groups_simple ON anaphora_syntax_groups_simple.group_id = complex.child_gid INNER JOIN tf_revisions ON tf_revisions.tf_id = anaphora_syntax_groups_simple.token_id WHERE tf_revisions.is_last = 1 AND comp_g.head_id = 0 AND comp_g.group_type = 9 AND simp_g.group_type = 1 GROUP BY parent_gid HAVING children = 1")

base_in_name = editor.db_cursor.fetchall()

for row in base_in_name:
    editor.db_cursor.execute("UPDATE anaphora_syntax_groups SET head_id = " + str(row['child_gid']) + " WHERE group_id = " + str(row['parent_gid']))


# In appositions with exactly 2 base groups, make the first base group the
# head.
editor.db_cursor.execute("SELECT complex.parent_gid, complex.child_gid, COUNT(child_gid) as children FROM anaphora_syntax_groups AS comp_g INNER JOIN anaphora_syntax_groups_complex AS complex ON complex.parent_gid = comp_g.group_id INNER JOIN anaphora_syntax_groups AS simp_g ON simp_g.group_id = complex.child_gid INNER JOIN anaphora_syntax_groups_simple ON anaphora_syntax_groups_simple.group_id = complex.child_gid WHERE comp_g.head_id = 0 AND comp_g.group_type = 10 AND simp_g.group_type = 1 GROUP BY parent_gid HAVING children = 2")

supplement = editor.db_cursor.fetchall()

for row in supplement:
    # parent_gid is wrapped in str() like in every other query of this
    # script; concatenating a non-string id to the SQL text would raise
    # TypeError.
    # NOTE(review): `supplement` is only a Python variable in the visible
    # code; no SQL table or view of that name is created here -- confirm the
    # intended table.  Also note the space in "MIN (": some SQL dialects
    # (e.g. MySQL without IGNORE_SPACE) reject it -- confirm.
    editor.db_cursor.execute("SELECT MIN (child_gid) as first_group FROM supplement WHERE parent_gid = " + str(row['parent_gid']))
    first_base = editor.db_cursor.fetchone()
    # Make the smallest child group id the head of the parent group.
    editor.db_cursor.execute("UPDATE anaphora_syntax_groups SET head_id = " + str(first_base['first_group']) + " WHERE group_id = " + str(row['parent_gid']))


# All remaining complex groups -- prepositional, proper names, non-proper
# names, appositions, nominal -- are bad groups.
editor.db_cursor.execute("SELECT anaphora_syntax_groups_complex.parent_gid, anaphora_syntax_groups.group_type FROM anaphora_syntax_groups_complex INNER JOIN anaphora_syntax_groups ON anaphora_syntax_groups_complex.parent_gid = anaphora_syntax_groups.group_id WHERE head_id = 0 AND group_type IN (8, 9, 10, 11, 13)")

bad_complex_groups = editor.db_cursor.fetchall()
for row in bad_complex_groups:
    # Flag every still-headless group of the listed types.
    editor.db_cursor.execute("UPDATE anaphora_syntax_groups SET marks = 'bad' WHERE group_id = " + str(row['parent_gid']))

editor.commit()

Пример #5
0
def main():
    """Run ``update_annotation`` on a new editor and commit the result.

    The commit is skipped when the literal argument 'debug' appears
    anywhere in ``sys.argv``.
    """
    annotation_editor = AnnotationEditor(CONFIG_PATH)
    update_annotation(annotation_editor)

    debug_requested = 'debug' in sys.argv
    if debug_requested:
        return
    annotation_editor.commit()
Пример #6
0
def main():
    """Call ``process`` with the first command-line argument, then commit."""
    annotation_editor = AnnotationEditor('/corpus/config.ini')
    first_argument = sys.argv[1]
    process(first_argument, annotation_editor)
    annotation_editor.commit()
Пример #7
0
def main():
    """Call ``merge`` with the editor's cursor and the first two
    command-line arguments, then commit."""
    annotation_editor = AnnotationEditor(CONFIG_PATH)
    cursor = annotation_editor.db_cursor
    first_argument, second_argument = sys.argv[1], sys.argv[2]
    merge(cursor, first_argument, second_argument)
    annotation_editor.commit()
Пример #8
0
def main():
    """Print a tab-separated report of all entries in ``ne_entities``.

    Loads every entity row together with its user id, sentence id and a
    space-separated list of tag ids, then resolves user names, sentence
    tokens and tag names with follow-up queries.  For each entity one UTF-8
    encoded line is printed with these tab-terminated fields: entity id,
    sentence id, ``updated_ts`` formatted as "%b %d, %H:%M", user name, tag
    names, the entity's own tokens, and all tokens of the sentence.

    The editor is committed at the end unless 'debug' is in ``sys.argv``.
    """
    editor = AnnotationEditor(CONFIG_PATH)
    dbh = editor.db_cursor

    # all entity entries
    dbh.execute("""
    SELECT ne_entities.*, user_id, sent_id, GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags
        FROM ne_entities 
            LEFT JOIN ne_paragraphs USING (annot_id)
            LEFT JOIN tokens ON start_token = tf_id
            LEFT JOIN ne_entity_tags USING (entity_id)
        GROUP BY entity_id
        ORDER BY sent_id, start_token, updated_ts
    """)
    results = dbh.fetchall()

    # Two distinct sets are required: `user_ids = sent_ids = set()` would
    # bind both names to one shared set and mix user ids with sentence ids
    # in both IN (...) lists below.
    user_ids = set()
    sent_ids = set()
    for row in results:
        user_ids.add(str(row['user_id']))
        sent_ids.add(str(row['sent_id']))

    # collect users separately
    users_query = """
    SELECT user_id, user_name, user_shown_name
        FROM users
        WHERE user_id IN ({0})
    """
    users = {}
    if user_ids:  # an empty "IN ()" list is not valid SQL
        dbh.execute(users_query.format(", ".join(user_ids)))
        for user in dbh.fetchall():
            # Prefer the shown name; fall back to the login name when empty.
            users[user["user_id"]] = (user["user_shown_name"]
                                      or user["user_name"])

    # collect all tokens from required sentences
    tokens_query = """
    SELECT *
        FROM tokens
        WHERE sent_id IN ({0})
        ORDER BY sent_id, pos
    """
    sentences = {}
    if sent_ids:  # same guard as for the user query
        dbh.execute(tokens_query.format(", ".join(sent_ids)))
        for sent in dbh.fetchall():
            sentences.setdefault(sent["sent_id"], []).append(sent)

    # collect tag names
    dbh.execute("SELECT * FROM ne_tags")
    tags = {}
    for tag in dbh.fetchall():
        tags[tag["tag_id"]] = tag["tag_name"]

    # output
    for row in results:
        # Every tag name is followed by a single space.
        tag_names = "".join(tags[int(tag)] + " "
                            for tag in row["tags"].split())

        ne_tokens = []
        all_tokens = []
        ne_len = 0
        for tkn in sentences[row["sent_id"]]:
            txt = tkn["tf_text"] + " "
            # these are ne tokens: the start token, then following tokens
            # until row["length"] tokens have been collected
            if (tkn["tf_id"] == row["start_token"]
                    or 0 < ne_len < row["length"]):
                ne_tokens.append(txt)
                ne_len += 1
            # all tokens for context
            all_tokens.append(txt)

        fields = [
            str(row["entity_id"]),
            str(row["sent_id"]),
            datetime.fromtimestamp(
                row["updated_ts"]).strftime("%b %d, %H:%M"),
            users[row["user_id"]],
            tag_names,
            "".join(ne_tokens),
            "".join(all_tokens),
        ]
        # Every field, including the last one, is terminated by a tab.
        out = "\t".join(fields) + "\t"
        # Parenthesised single-argument form: behaves exactly like the
        # Python 2 print statement.
        print(out.encode('UTF-8'))

    if 'debug' not in sys.argv:
        editor.commit()
Пример #9
0
def main():
    """Dump all ``ne_entities`` rows as tab-separated, UTF-8 encoded lines.

    For every entity the line holds, each followed by a tab: entity id,
    sentence id, ``updated_ts`` rendered with "%b %d, %H:%M", the user's
    shown name (or login name when the shown name is empty), the names of
    the entity's tags, the tokens belonging to the entity, and all tokens of
    the sentence for context.

    ``editor.commit()`` is called afterwards unless 'debug' is present in
    ``sys.argv``.
    """
    editor = AnnotationEditor(CONFIG_PATH)
    dbh = editor.db_cursor

    # all entity entries
    dbh.execute("""
    SELECT ne_entities.*, user_id, sent_id, GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags
        FROM ne_entities 
            LEFT JOIN ne_paragraphs USING (annot_id)
            LEFT JOIN tokens ON start_token = tf_id
            LEFT JOIN ne_entity_tags USING (entity_id)
        GROUP BY entity_id
        ORDER BY sent_id, start_token, updated_ts
    """)
    results = dbh.fetchall()

    # Build the two id sets independently.  A chained assignment
    # (`user_ids = sent_ids = set()`) makes both names refer to the same
    # set object, so each IN (...) list would contain both kinds of ids.
    user_ids = set(str(row['user_id']) for row in results)
    sent_ids = set(str(row['sent_id']) for row in results)

    # collect users separately
    users_sql = """
    SELECT user_id, user_name, user_shown_name
        FROM users
        WHERE user_id IN ({0})
    """
    users = {}
    if user_ids:  # skip the query: "IN ()" with no values is invalid SQL
        dbh.execute(users_sql.format(", ".join(user_ids)))
        for user in dbh.fetchall():
            if len(user["user_shown_name"]) > 0:
                user_name = user["user_shown_name"]
            else:
                user_name = user["user_name"]
            users[user["user_id"]] = user_name

    # collect all tokens from required sentences
    tokens_sql = """
    SELECT *
        FROM tokens
        WHERE sent_id IN ({0})
        ORDER BY sent_id, pos
    """
    sentences = {}
    if sent_ids:  # same guard as above
        dbh.execute(tokens_sql.format(", ".join(sent_ids)))
        for sent in dbh.fetchall():
            if sent["sent_id"] not in sentences:
                sentences[sent["sent_id"]] = []
            sentences[sent["sent_id"]].append(sent)

    # collect tag names
    dbh.execute("SELECT * FROM ne_tags")
    tags = dict((tag["tag_id"], tag["tag_name"]) for tag in dbh.fetchall())

    # output
    for row in results:
        parts = [
            str(row["entity_id"]),
            str(row["sent_id"]),
            datetime.fromtimestamp(row["updated_ts"]).strftime("%b %d, %H:%M"),
            users[row["user_id"]],
            # each tag name is followed by one space
            "".join(tags[int(tag)] + " " for tag in row["tags"].split()),
        ]

        entity_tokens = []
        context_tokens = []
        ne_len = 0
        for tkn in sentences[row["sent_id"]]:
            txt = tkn["tf_text"] + " "
            # these are ne tokens: the start token plus the tokens after it
            # while fewer than row["length"] have been taken
            if tkn["tf_id"] == row["start_token"] or (ne_len > 0 and ne_len < row["length"]):
                entity_tokens.append(txt)
                ne_len += 1
            # all tokens for context
            context_tokens.append(txt)
        parts.append("".join(entity_tokens))
        parts.append("".join(context_tokens))

        # Each field, the last one included, ends with a tab.
        out = "\t".join(parts) + "\t"
        # With a single parenthesised argument this is equivalent to the
        # Python 2 print statement.
        print(out.encode('UTF-8'))

    if 'debug' not in sys.argv:
        editor.commit()
Пример #10
0
def main():
    """Merge using the two command-line arguments, then commit.

    ``merge`` receives the editor's database cursor followed by
    ``sys.argv[1]`` and ``sys.argv[2]``.
    """
    annotation_editor = AnnotationEditor(CONFIG_PATH)
    merge_arguments = (annotation_editor.db_cursor, sys.argv[1], sys.argv[2])
    merge(*merge_arguments)
    annotation_editor.commit()