Exemplo n.º 1
0
def main():
    """Compute and write IOBES output for each page yielded by the page query.

    Loads ``.env``, reads the ``DBHOST``, ``DBUSER``, ``DBPASS`` and
    ``EL_DBNAME`` environment variables, and opens a single MySQL connection
    with them. Two cursors are opened on that connection: one is handed to
    ``get_nondisambiguation_pages_having_mentions`` and one is used for the
    per-page ``get_page_mentions_by_entity`` lookups. The connection is
    closed even if processing raises.
    """
    load_dotenv(dotenv_path='.env')
    connection_settings = {
        'host': os.getenv("DBHOST"),
        'user': os.getenv("DBUSER"),
        'password': os.getenv("DBPASS"),
        'db': os.getenv("EL_DBNAME"),
        'charset': 'utf8mb4',
        'use_unicode': True,
        'cursorclass': pymysql.cursors.DictCursor,
    }
    connection = pymysql.connect(**connection_settings)
    # Issued through each cursor right after it is opened, in this order.
    charset_statements = (
        "SET NAMES utf8mb4;",
        "SET CHARACTER SET utf8mb4;",
        "SET character_set_connection=utf8mb4;",
    )
    try:
        with connection.cursor() as pages_cursor:
            for statement in charset_statements:
                pages_cursor.execute(statement)
            # NOTE(review): the second cursor presumably exists so mention
            # lookups do not disturb the page result set -- confirm against
            # get_nondisambiguation_pages_having_mentions.
            with connection.cursor() as mentions_cursor:
                for statement in charset_statements:
                    mentions_cursor.execute(statement)
                pages, page_count = get_nondisambiguation_pages_having_mentions(
                    pages_cursor)
                for page in progressbar(pages, max_value=page_count):
                    mentions = get_page_mentions_by_entity(
                        mentions_cursor, page['id'])
                    link_titles = _.pluck(mentions, 'entity')
                    write_page_iobes(
                        page, get_page_iobes(page, mentions, link_titles))
    finally:
        connection.close()
Exemplo n.º 2
0
def test_get_page_iobes_overlapping_matches():
  """Check the IOBES rows for a page with a three-token and a one-token mention.

  The multi-token mention is expected to be tagged B/I/E, the single-token
  mention S with its link title percent-encoded, and unlinked tokens O.
  """
  page = {
    'source_id': 0,
    'title': 'Other',
    'content': 'some other text and my stuff',
  }
  leading_mention = {'text': 'some other text', 'offset': 0, 'page_title': 'Other'}
  single_token_mention = {'text': 'my', 'offset': 20, 'page_title': 'My page'}
  mentions = [leading_mention, single_token_mention]
  mention_link_titles = ['Other', 'My page']

  expected = [[
    ['some', 'Other', 'B'],
    ['other', 'Other', 'I'],
    ['text', 'Other', 'E'],
    ['and', 'O'],
    ['my', 'My%20page', 'S'],
    ['stuff', 'O'],
  ]]
  actual = iobes.get_page_iobes(page, mentions, mention_link_titles)
  assert expected == actual
Exemplo n.º 3
0
def test_get_page_iobes_straddling_mention():
  """Check a mention whose text contains a '.' between its tokens.

  The single mention covers the whole page content; every token, including
  the '.', is expected to carry the percent-encoded link title, tagged
  B, I, I, E in order.
  """
  page = {
    'content': '2002–03 NHL. season',
    'source_id': 0,
    'title': '2002–03 Buffalo Sabres season',
  }
  mentions = [{
    'text': '2002–03 NHL. season',
    'offset': 0,
    'page_title': '2002–03 Buffalo Sabres season',
  }]
  mention_link_titles = ['2002–03 NHL season']

  encoded_title = '2002%E2%80%9303%20NHL%20season'
  tagged_tokens = [('2002–03', 'B'), ('NHL', 'I'), ('.', 'I'), ('season', 'E')]
  expected = [[[token, encoded_title, tag] for token, tag in tagged_tokens]]

  actual = iobes.get_page_iobes(page, mentions, mention_link_titles)
  assert expected == actual
Exemplo n.º 4
0
def test_get_page_iobes():
  """Compare get_page_iobes output for the parade fixture page with parade_iobes.

  Loads the page and its mention contexts from the JSON fixtures, drops
  mentions whose offset is not inside the page content, orders the remaining
  mentions by offset, and passes them with their link titles (the context
  keys) to ``iobes.get_page_iobes``.
  """
  # JSON interchange text is UTF-8; decode explicitly rather than relying on
  # the platform's locale encoding, which would break on non-ASCII fixtures.
  with open('test/fixtures/parade_page_db.json', encoding='utf-8') as f:
    parade_page = json.load(f)
  with open('test/fixtures/parade_page_contexts.json', encoding='utf-8') as f:
    raw_contexts = json.load(f)

  content_length = len(parade_page['content'])

  def is_in_bounds(mention):
    """Return True when the mention starts inside the page content."""
    return mention['offset'] < content_length

  parade_page_contexts = _.map_values(
    raw_contexts,
    lambda mentions: list(filter(is_in_bounds, mentions)))
  # Expand {title: [mention, ...]} into [title, mention] pairs.
  context_pairs = _.mapcat(_.to_pairs(parade_page_contexts),
                           lambda pair: [[pair[0], mention] for mention in pair[1]])
  contexts = _.sort_by(context_pairs,
                       lambda title_mention: title_mention[1]['offset'])
  # Split the ordered pairs back into parallel mention / title lists.
  mentions = _.flat_map(contexts, _.last)
  mention_link_titles = list(map(_.head, contexts))
  assert parade_iobes == iobes.get_page_iobes(parade_page, mentions, mention_link_titles)