Пример #1
0
def add_links_from_file(filename,
                        link_type,
                        file_type,
                        config_file,
                        is_to_print_time,
                        is_to_add_several_lexemes,
                        is_dry_run,
                        revset_id=None,
                        comment=""):
    """Parse links from *filename* and record them through an AnnotationEditor.

    When *comment* is empty, the base name of the input file is used instead.
    In dry-run mode everything is staged but nothing is committed.  When
    *is_to_print_time* is set, the elapsed wall-clock time is printed.
    """
    started_at = datetime.datetime.now()

    editor = AnnotationEditor(config_file)
    links = parse_links_from_file(filename, link_type, file_type)

    # Fall back to the input file's base name when no comment was supplied.
    if comment == "":
        comment = os.path.basename(filename)

    add_links(editor, links, revset_id, comment, is_to_add_several_lexemes)

    # A dry run parses and stages but never commits.
    if not is_dry_run:
        editor.commit()

    if is_to_print_time:
        elapsed = datetime.datetime.now() - started_at
        print('time elapsed for add_links_from_file:{0}'.format(elapsed))
Пример #2
0
def add_links_from_file(filename, link_type, file_type, config_file, is_to_print_time, is_to_add_several_lexemes, is_dry_run, revset_id=None, comment=""):
    """Read links from *filename* and add them via an AnnotationEditor.

    An empty *comment* defaults to the input file's base name.  The commit
    is skipped in dry-run mode; timing info is printed on request.
    """
    t0 = datetime.datetime.now()

    annot_editor = AnnotationEditor(config_file)
    parsed_links = parse_links_from_file(filename, link_type, file_type)

    # Use the file's base name when the caller gave no comment.
    effective_comment = os.path.basename(filename) if comment == "" else comment
    add_links(annot_editor, parsed_links, revset_id, effective_comment, is_to_add_several_lexemes)

    if not is_dry_run:
        annot_editor.commit()

    if is_to_print_time:
        print('time elapsed for add_links_from_file:{0}'.format(datetime.datetime.now() - t0))
Пример #3
0
def main():
    """CLI entry point: export groups in the mode named by argv[1].

    Exits with status 1 and a usage message unless argv[1] is one of
    'simple', 'complex' or 'both'.  An optional second argument 'mod'
    restricts the export to moderated data.
    """
    editor = AnnotationEditor(CONFIG_PATH)

    mode = sys.argv[1] if len(sys.argv) >= 2 else None
    if mode not in ('simple', 'complex', 'both'):
        sys.stderr.write(
            """Usage: {0} {{simple|complex|both}} [mod]\n\tmod: export only moderators' groups, otherwise first user's annotation for each text\n"""
            .format(sys.argv[0]))
        sys.exit(1)

    only_moderated = len(sys.argv) > 2 and sys.argv[2] == 'mod'
    do_export(editor.db_cursor, mode, only_moderated)
Пример #4
0
def find_adv_comp_pairs(config_file):
    """Pair comparative-form lexemes with their positive-form adverbs.

    Looks up every lexeme whose lemma ends in ENDING_COMPARATIVE and carries
    COMPARATIVE_GLOSS, derives the adverb stem, and matches it against
    lexemes glossed as adverbs.  Returns a list of
    ({'id', 'text'} comparative, {'id', 'text'} positive) tuples.
    """
    editor = AnnotationEditor(config_file)
    comparatives = editor.find_lexeme_by_lemma(u'%' + ENDING_COMPARATIVE, [COMPARATIVE_GLOSS], lemma_is_regex=True)

    pairs = []
    for comparative in comparatives:
        comparative_text = comparative.lemma['text'].decode('utf-8')
        stem = get_adverb_stem(comparative_text)

        # All positive (non-comparative) adverb lexemes sharing this stem.
        for positive in editor.find_lexeme_by_lemma(stem, [ADVERB_GLOSS]):
            positive_text = positive.lemma['text'].decode('utf-8')
            pairs.append((
                {'id': comparative._id, 'text': comparative_text},
                {'id': positive._id, 'text': positive_text},
            ))

    return pairs
Пример #5
0
# -*- coding: utf-8 -*-
# Migration/cleanup script for the anaphora_syntax_groups table:
# ensures the "marks" ENUM column exists before further processing.

import sys

sys.path.append("/corpus/python")
from Annotation import AnnotationEditor

editor = AnnotationEditor("config.ini")


# Inspect the current schema of anaphora_syntax_groups.
editor.db_cursor.execute("SHOW columns FROM anaphora_syntax_groups")
has_marks = False
rows = editor.db_cursor.fetchall()


# add the "marks" column only if it does not already exist
for row in rows:
    if row["Field"] == "marks":
        has_marks = True
        break

if not has_marks:
    editor.db_cursor.execute("ALTER TABLE anaphora_syntax_groups ADD marks ENUM('bad', 'suspicious', 'no head', 'all')")


""" ==============================
        ПРОСТЫЕ ГРУППЫ
    ============================== """


# remove heads, add the "no head" tag to parenthetical expressions, compound conjunctions, compound prepositions, and adverbial expressions
Пример #6
0
# -*- coding: utf-8 -*-
# Migration/cleanup script for the anaphora_syntax_groups table:
# ensures the "marks" ENUM column exists, then processes simple groups.

import sys
sys.path.append('/corpus/python')
from Annotation import AnnotationEditor
editor = AnnotationEditor('config.ini')


# Inspect the current schema of anaphora_syntax_groups.
editor.db_cursor.execute("SHOW columns FROM anaphora_syntax_groups")
has_marks = False
rows = editor.db_cursor.fetchall()


# add the "marks" column only if it does not already exist
for row in rows:
    if row['Field'] == 'marks':
        has_marks = True
        break

if not has_marks:
    editor.db_cursor.execute("ALTER TABLE anaphora_syntax_groups ADD marks ENUM('bad', 'suspicious', 'no head', 'all')")


""" ==============================
        ПРОСТЫЕ ГРУППЫ
    ============================== """


# remove heads, add the "no head" tag to parenthetical expressions, compound conjunctions, compound prepositions, and adverbial expressions
editor.db_cursor.execute("SELECT anaphora_syntax_groups_simple.group_id, group_type FROM anaphora_syntax_groups_simple INNER JOIN anaphora_syntax_groups ON anaphora_syntax_groups_simple.group_id = anaphora_syntax_groups.group_id WHERE group_type IN (4, 5, 6, 7)")
Пример #7
0
def main():
    """Entry point: run the export against a cursor on the configured DB."""
    annot_editor = AnnotationEditor(CONFIG_PATH)
    cursor = annot_editor.db_cursor
    do_export(cursor)
Пример #8
0
def main():
    """Entry point: update the annotation, committing unless in debug mode."""
    annot_editor = AnnotationEditor(CONFIG_PATH)
    update_annotation(annot_editor)

    # Passing 'debug' anywhere on the command line suppresses the commit.
    debug_requested = 'debug' in sys.argv
    if not debug_requested:
        annot_editor.commit()
Пример #9
0
def main():
    """Entry point: process the file named by argv[1], then commit."""
    annot_editor = AnnotationEditor('/corpus/config.ini')
    input_arg = sys.argv[1]
    process(input_arg, annot_editor)
    annot_editor.commit()
Пример #10
0
def main():
    """Entry point: merge using the first two CLI arguments, then commit."""
    annot_editor = AnnotationEditor(CONFIG_PATH)
    first_arg = sys.argv[1]
    second_arg = sys.argv[2]
    merge(annot_editor.db_cursor, first_arg, second_arg)
    annot_editor.commit()
Пример #11
0
def main():
    """Export every named-entity annotation as tab-separated text on stdout.

    Per entity: entity id, sentence id, last-update time, annotator name,
    tag names, the entity's tokens, and the whole sentence for context.
    Commits at the end unless 'debug' is present on the command line.
    NOTE(review): Python 2 script (str/bytes handling, .encode on output).
    """
    editor = AnnotationEditor(CONFIG_PATH)
    dbh = editor.db_cursor

    # all entity entries
    dbh.execute("""
    SELECT ne_entities.*, user_id, sent_id, GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags
        FROM ne_entities 
            LEFT JOIN ne_paragraphs USING (annot_id)
            LEFT JOIN tokens ON start_token = tf_id
            LEFT JOIN ne_entity_tags USING (entity_id)
        GROUP BY entity_id
        ORDER BY sent_id, start_token, updated_ts
    """)
    results = dbh.fetchall()
    # BUG FIX: the original `user_ids = sent_ids = set()` bound both names to
    # ONE shared set, so user ids and sentence ids were mixed together and
    # both IN(...) queries below received the union of the two id sets.
    user_ids = set()
    sent_ids = set()
    for row in results:
        user_ids.add(str(row['user_id']))
        sent_ids.add(str(row['sent_id']))

    # collect users separately
    # NOTE(review): an empty result set would produce `IN ()` — invalid SQL;
    # assumed non-empty in practice, as in the original.
    dbh.execute("""
    SELECT user_id, user_name, user_shown_name
        FROM users
        WHERE user_id IN ({0})
    """.format(", ".join(user_ids)))
    users_res = dbh.fetchall()
    users = {}
    for user in users_res:
        # Prefer the display name; fall back to the login name.
        if len(user["user_shown_name"]) > 0:
            user_name = user["user_shown_name"]
        else:
            user_name = user["user_name"]
        users[user["user_id"]] = user_name

    # collect all tokens from required sentences
    dbh.execute("""
    SELECT *
        FROM tokens
        WHERE sent_id IN ({0})
        ORDER BY sent_id, pos
    """.format(", ".join(sent_ids)))
    sent_res = dbh.fetchall()
    sentences = {}
    for sent in sent_res:
        if sent["sent_id"] not in sentences:
            sentences[sent["sent_id"]] = []
        sentences[sent["sent_id"]].append(sent)

    # collect tag names
    dbh.execute("SELECT * FROM ne_tags")
    tags_res = dbh.fetchall()
    tags = {}
    for tag in tags_res:
        tags[tag["tag_id"]] = tag["tag_name"]

    # output: one TSV line per entity
    for row in results:
        out = ""
        out += str(row["entity_id"]) + "\t"
        out += str(row["sent_id"]) + "\t"
        out += datetime.fromtimestamp(
            row["updated_ts"]).strftime("%b %d, %H:%M") + "\t"
        out += users[row["user_id"]] + "\t"
        for tag in row["tags"].split():
            out += tags[int(tag)] + " "
        out += "\t"
        tokens_all = tokens = ""
        ne_len = 0
        for tkn in sentences[row["sent_id"]]:
            txt = tkn["tf_text"] + " "
            # entity tokens: start at start_token, run for `length` tokens
            if tkn["tf_id"] == row["start_token"] or (ne_len > 0 and
                                                      ne_len < row["length"]):
                tokens += txt
                ne_len += 1
            # all tokens for context
            tokens_all += txt
        out += tokens + "\t"
        out += tokens_all + "\t"
        # Parenthesized print: identical behavior under Python 2.
        print(out.encode('UTF-8'))

    if 'debug' not in sys.argv:
        editor.commit()
Пример #12
0
def main():
    """Export every named-entity annotation as tab-separated text on stdout.

    Per entity: entity id, sentence id, last-update time, annotator name,
    tag names, the entity's tokens, and the whole sentence for context.
    Commits at the end unless 'debug' is present on the command line.
    NOTE(review): Python 2 script (str/bytes handling, .encode on output).
    """
    editor = AnnotationEditor(CONFIG_PATH)
    dbh = editor.db_cursor

    # all entity entries
    dbh.execute("""
    SELECT ne_entities.*, user_id, sent_id, GROUP_CONCAT(tag_id ORDER BY tag_id SEPARATOR ' ') AS tags
        FROM ne_entities 
            LEFT JOIN ne_paragraphs USING (annot_id)
            LEFT JOIN tokens ON start_token = tf_id
            LEFT JOIN ne_entity_tags USING (entity_id)
        GROUP BY entity_id
        ORDER BY sent_id, start_token, updated_ts
    """)
    results = dbh.fetchall()
    # BUG FIX: the original `user_ids = sent_ids = set()` bound both names to
    # ONE shared set, so user ids and sentence ids were mixed together and
    # both IN(...) queries below received the union of the two id sets.
    user_ids = set()
    sent_ids = set()
    for row in results:
        user_ids.add(str(row['user_id']))
        sent_ids.add(str(row['sent_id']))

    # collect users separately
    # NOTE(review): an empty result set would produce `IN ()` — invalid SQL;
    # assumed non-empty in practice, as in the original.
    dbh.execute("""
    SELECT user_id, user_name, user_shown_name
        FROM users
        WHERE user_id IN ({0})
    """.format(", ".join(user_ids)))
    users_res = dbh.fetchall()
    users = {}
    for user in users_res:
        # Prefer the display name; fall back to the login name.
        if len(user["user_shown_name"]) > 0:
            user_name = user["user_shown_name"]
        else:
            user_name = user["user_name"]
        users[user["user_id"]] = user_name

    # collect all tokens from required sentences
    dbh.execute("""
    SELECT *
        FROM tokens
        WHERE sent_id IN ({0})
        ORDER BY sent_id, pos
    """.format(", ".join(sent_ids)))
    sent_res = dbh.fetchall()
    sentences = {}
    for sent in sent_res:
        if sent["sent_id"] not in sentences:
            sentences[sent["sent_id"]] = []
        sentences[sent["sent_id"]].append(sent)

    # collect tag names
    dbh.execute("SELECT * FROM ne_tags")
    tags_res = dbh.fetchall()
    tags = {}
    for tag in tags_res:
        tags[tag["tag_id"]] = tag["tag_name"]

    # output: one TSV line per entity
    for row in results:
        out = ""
        out += str(row["entity_id"]) + "\t"
        out += str(row["sent_id"]) + "\t"
        out += datetime.fromtimestamp(row["updated_ts"]).strftime("%b %d, %H:%M") + "\t"
        out += users[row["user_id"]] + "\t"
        for tag in row["tags"].split():
            out += tags[int(tag)] + " "
        out += "\t"
        tokens_all = tokens = ""
        ne_len = 0
        for tkn in sentences[row["sent_id"]]:
            txt = tkn["tf_text"] + " "
            # entity tokens: start at start_token, run for `length` tokens
            if tkn["tf_id"] == row["start_token"] or (ne_len > 0 and ne_len < row["length"]):
                tokens += txt
                ne_len += 1
            # all tokens for context
            tokens_all += txt
        out += tokens + "\t"
        out += tokens_all + "\t"
        # Parenthesized print: identical behavior under Python 2.
        print(out.encode('UTF-8'))

    if 'debug' not in sys.argv:
        editor.commit()
Пример #13
0
def main():
    """Entry point: run the import, reading from standard input."""
    annot_editor = AnnotationEditor(CONFIG_PATH)
    do_import(annot_editor, sys.stdin)
Пример #14
0
def main():
    """Entry point: merge using the first two CLI arguments, then commit."""
    editor_instance = AnnotationEditor(CONFIG_PATH)
    cursor = editor_instance.db_cursor
    merge(cursor, sys.argv[1], sys.argv[2])
    editor_instance.commit()