Python georesolve_page_2 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: defoe.nls.query_utils

메소드/함수: georesolve_page_2

hotexamples.com에서의 예제들: 4

Python georesolve_page_2 - 4개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 defoe.nls.query_utils.georesolve_page_2에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

def do_query(df, config_file=None, logger=None, context=None):
    """
    Ingest NLS pages, applies scpaCy NLP pipeline for identifying the possible locations of each page. And applies the edinburgh geoparser for getting the latituted and longitude of each of them.
    Before applying the geoparser, two clean steps are applied - long-S and hyphen words. 
    
    Example:

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: 
    :rtype: string
    """
    #with open(config_file, "r") as f:
    #    config = yaml.load(f)

    #year = config["year"]

    lang_model = "en_core_web_lg"
    fdf = df.withColumn("source_text_clean",
                        blank_as_null("source_text_clean"))

    #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]==year).filter(df["archive_filename"]=="/home/tdm/datasets/nls-data-gazetteersOfScotland/97376462").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean)

    #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]=="1883").filter(df["edition"]=="1884-1885, Volume 3").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean)

    newdf = fdf.filter(fdf.source_text_clean.isNotNull()).filter(
        fdf["model"] == "nls").filter(df["year"] == "1828").select(
            fdf.year, fdf.title, fdf.edition, fdf.archive_filename,
            fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean)

    pages = newdf.rdd.map(tuple)
    matching_pages = pages.map(lambda geo_page: (
        geo_page[0], {
            "title": geo_page[1],
            "edition": geo_page[2],
            "archive": geo_page[3],
            "page_filename": geo_page[4],
            "text_unit id": geo_page[5],
            "lang_model": lang_model,
            "georesolution_page": georesolve_page_2(geo_page[6], lang_model)
        }))

    result = matching_pages \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result

예제 #2

파일 보기

파일: georesolution_pages.py 프로젝트: alan-turing-institute/defoe

def do_query(archives, config_file=None, logger=None, context=None):
    """
    Ingest NLS pages, applies scpaCy NLP pipeline for identifying the possible locations of each page. And applies the edinburgh geoparser for getting the latituted and longitude of each of them.
    Before applying the geoparser, two clean steps are applied - long-S and hyphen words. 
    
    Example:

    ("Descriptive account of the principal towns in Scotland: to accompany Wood's town atlas", '1828', 1828, 'Edinburgh', '/home/tdm/datasets/nls-data-gazetteersOfScotland/97350713', 'alto/97350911.34.xml', 'page', 'Page17', 376, 'book', 'nls', 'CONTENTS. Page. Aberdeen, 1 Annan, 19 Arbroath, 23 Ayr, .--SO Banff, 39 Berwick, 4S Brechin, 55 Crieff, 61 Cupar Fife, • 65 Dalkeith, 70 Dingwall, 76 DunbartorT, • 79 Dundee, 83 Dumfries, <• 91 Dunfermline, 99 Dunkeid, « 105 Edinburgh, -. . 1 1 1 Elgin, . . . ]29 Forfar, -135 Forres, 139 Glasgow, . 117', {}), ("Descriptive account of the principal towns in Scotland: to accompany Wood's town atlas", '1828', 1828, 'Edinburgh', '/home/tdm/datasets/nls-data-gazetteersOfScotland/97350713', 'alto/97350923.34.xml', 'page', 'Page18', 376, 'book', 'nls', 'Xll Greenock, 171 Haddington, 181 Hamilton, 185 Hawick, 191 Inverary, 199 Inverness, . * •> 203 Irvine, * 211 Jedburgh, * * 215 Kelso, 221 Kilmarnock, • 227 Kirkcaldy 233 Kinross, * * 241 Lanark, * 247 Leith, 253 Linlithgow, «• * 265 Montrose, 271 Nairn, 277 Paisley, 281 Peebles, 291 Perth, * 297 Portobello, 309 Rothesay, * 313 Selkirk, > , 319 St Andrews, 323 Stirling, -^331 Stonehaven, * 339 Stornowav, ... Si-5', {('Hamilton', '1'): ('55.77731433348086', '-4.067392672500774'), ('Inverary', '2'): ('56.2333333', '-5.0666667'), ('Inverness', '3'): ('57.47871409771949', '-4.212450527351024'), ('Lanark', '4'): ('55.67483195471274', '-3.775417694605498')}),



    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)

    lang_model = config["lang_model"]
    documents = archives.flatMap(
        lambda archive: [(document.year, document.title, document.edition, \
                          document.archive.filename, document) for document in list(archive)])

    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                                year_document[3], page.code, page.page_id, clean_page_as_string(page)) for page in year_document[4]])

    matching_pages = pages_clean.map(lambda geo_page: (
        geo_page[0], {
            "title": geo_page[1],
            "edition": geo_page[2],
            "archive": geo_page[3],
            "page_filename": geo_page[4],
            "text_unit id": geo_page[5],
            "lang_model": lang_model,
            "georesolution_page": georesolve_page_2(geo_page[6], lang_model)
        }))

    result = matching_pages \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result

예제 #3

파일 보기

def do_query(archives, config_file=None, logger=None, context=None):
    """
    It ingest NLS pages, applies scpaCy NLP pipeline for identifying the possible locations of each page. 
    And it applies the edinburgh geoparser (just the georesolver) for getting the latituted and longitude of each of them.
    
    Before applying the spaCy NLP, two clean steps are applied - long-S and hyphen words. 
    
    A config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    A config_file should be indicated to specify the lang_model, gazetteer to use, 
    the defoe_path, the bounding box (optional), as well as the operating system. 
    
    Example:
      - 1842:
        - archive: /home/rosa_filgueira_vicente/datasets/sg_simple_sample/97437554
        - edition: 1842, Volume 1
        - georesolution_page:
            - Aberdeenshire-19:
              - in-cc: ''
              - lat: '57.21923117162595'
              - long: '-2.801013003249016'
              - pop: ''
              - snippet: 'BUCHAN , a district of Aberdeenshire , extending along the coast '
              - type: civila
            - Cumberland-12:
              - in-cc: ''
              - lat: '51.4342921249674'
              - long: '-0.6131610294930387'
              - pop: ''
              - snippet: 'all the low country of Cumberland lies full before you , '
              - type: fac
             ....
        - lang_model: en_core_web_lg
        - page_filename: alto/97440572.34.xml
        - text_unit id: Page252
        - title: topographical, statistical, and historical gazetteer of Scotland
    

    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: "0"
    :rtype: string
    """
    with open(config_file, "r") as f:
        config = yaml.load(f)
    
    lang_model = config["lang_model"]
    gazetteer = config["gazetteer"]
    if "bounding_box" in config:
        bounding_box = " -lb " + config["bounding_box"] + " 2"
    else:
        bounding_box = ""
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type= "sys-i386-snow-leopard"
    else:
            os_type = "sys-i386-64"
    if "defoe_path" in config :
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"
    documents = archives.flatMap(
        lambda archive: [(document.year, document.title, document.edition, \
                          document.archive.filename, document) for document in list(archive)])
    
    pages_clean = documents.flatMap(
        lambda year_document: [(year_document[0], year_document[1], year_document[2],\
                                year_document[3], page.code, page.page_id, clean_page_as_string(page,defoe_path, os_type)) for page in year_document[4]])

    matching_pages = pages_clean.map(
        lambda geo_page:
        (geo_page[0],
         {"title": geo_page[1],
          "edition": geo_page[2],
          "archive": geo_page[3], 
          "page_filename": geo_page[4],
          "text_unit id": geo_page[5],
          "lang_model": lang_model, 
          "georesolution_page": georesolve_page_2(geo_page[6],lang_model, defoe_path, gazetteer, bounding_box)}))
    
    result = matching_pages \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result

예제 #4

파일 보기

파일: georesolution_pages.py 프로젝트: akrause2014/defoe-1

def do_query(df, config_file=None, logger=None, context=None):
    """
    Retrieves NLS pages from ES, which have been previously clean and stored. 
    Applies scpaCy NLP pipeline for identifying the possible locations of each page. And applies the edinburgh geoparser for getting the latituted and longitude of each of them.
    
    A config_file must be the path to a lexicon file with a list of the keywords 
    to search for, one per line.
    
    A config_file should be indicated to specify the lang_model, gazetteer to use, 
    the defoe_path, the bounding box (optional), as well as the operating system. 
    
    Example:
      - 1842:
        - archive: /home/rosa_filgueira_vicente/datasets/sg_simple_sample/97437554
        - edition: 1842, Volume 1
        - georesolution_page:
            - Aberdeenshire-19:
              - in-cc: ''
              - lat: '57.21923117162595'
              - long: '-2.801013003249016'
              - pop: ''
              - snippet: 'BUCHAN , a district of Aberdeenshire , extending along the coast '
              - type: civila
            - Cumberland-12:
              - in-cc: ''
              - lat: '51.4342921249674'
              - long: '-0.6131610294930387'
              - pop: ''
              - snippet: 'all the low country of Cumberland lies full before you , '
              - type: fac
             ....
        - lang_model: en_core_web_lg
        - page_filename: alto/97440572.34.xml
    
    :param archives: RDD of defoe.nls.archive.Archive
    :type archives: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: 
    :rtype: string
    """
   
    with open(config_file, "r") as f:
        config = yaml.load(f)

    lang_model = config["lang_model"]
    gazetteer = config["gazetteer"]
    if "bounding_box" in config:
        bounding_box = " -lb " + config["bounding_box"] + " 2"
    else:
        bounding_box = ""
    if "os_type" in config:
        if config["os_type"] == "linux":
            os_type = "sys-i386-64"
        else:
            os_type= "sys-i386-snow-leopard"
    else:
            os_type = "sys-i386-64"
    if "defoe_path" in config :
        defoe_path = config["defoe_path"]
    else:
        defoe_path = "./"

    fdf = df.withColumn("source_text_clean", blank_as_null("source_text_clean"))

    #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]==year).filter(df["archive_filename"]=="/home/tdm/datasets/nls-data-gazetteersOfScotland/97376462").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean)
    
    #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]=="1883").filter(df["edition"]=="1884-1885, Volume 3").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean)

    newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]=="1828").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean)

    pages=newdf.rdd.map(tuple)
    matching_pages = pages_clean.map(
        lambda geo_page:
        (geo_page[0],
         {"title": geo_page[1],
          "edition": geo_page[2],
          "archive": geo_page[3], 
          "page_filename": geo_page[4],
          "text_unit id": geo_page[5],
          "lang_model": lang_model, 
          "georesolution_page": georesolve_page_2(geo_page[6],lang_model, defoe_path, gazetteer, bounding_box)}))
    
    result = matching_pages \
        .groupByKey() \
        .map(lambda date_context:
             (date_context[0], list(date_context[1]))) \
        .collect()
    return result