def do_query(df, config_file=None, logger=None, context=None): """ Ingest NLS pages, applies scpaCy NLP pipeline for identifying the possible locations of each page. And applies the edinburgh geoparser for getting the latituted and longitude of each of them. Before applying the geoparser, two clean steps are applied - long-S and hyphen words. Example: :param archives: RDD of defoe.nls.archive.Archive :type archives: pyspark.rdd.PipelinedRDD :param config_file: query configuration file :type config_file: str or unicode :param logger: logger (unused) :type logger: py4j.java_gateway.JavaObject :return: :rtype: string """ #with open(config_file, "r") as f: # config = yaml.load(f) #year = config["year"] lang_model = "en_core_web_lg" fdf = df.withColumn("source_text_clean", blank_as_null("source_text_clean")) #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]==year).filter(df["archive_filename"]=="/home/tdm/datasets/nls-data-gazetteersOfScotland/97376462").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean) #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]=="1883").filter(df["edition"]=="1884-1885, Volume 3").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean) newdf = fdf.filter(fdf.source_text_clean.isNotNull()).filter( fdf["model"] == "nls").filter(df["year"] == "1828").select( fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean) pages = newdf.rdd.map(tuple) matching_pages = pages.map(lambda geo_page: ( geo_page[0], { "title": geo_page[1], "edition": geo_page[2], "archive": geo_page[3], "page_filename": geo_page[4], "text_unit id": geo_page[5], "lang_model": lang_model, "georesolution_page": georesolve_page_2(geo_page[6], lang_model) })) result = matching_pages \ .groupByKey() \ .map(lambda date_context: (date_context[0], list(date_context[1]))) \ .collect() return result
def do_query(archives, config_file=None, logger=None, context=None): """ Ingest NLS pages, applies scpaCy NLP pipeline for identifying the possible locations of each page. And applies the edinburgh geoparser for getting the latituted and longitude of each of them. Before applying the geoparser, two clean steps are applied - long-S and hyphen words. Example: ("Descriptive account of the principal towns in Scotland: to accompany Wood's town atlas", '1828', 1828, 'Edinburgh', '/home/tdm/datasets/nls-data-gazetteersOfScotland/97350713', 'alto/97350911.34.xml', 'page', 'Page17', 376, 'book', 'nls', 'CONTENTS. Page. Aberdeen, 1 Annan, 19 Arbroath, 23 Ayr, .--SO Banff, 39 Berwick, 4S Brechin, 55 Crieff, 61 Cupar Fife, • 65 Dalkeith, 70 Dingwall, 76 DunbartorT, • 79 Dundee, 83 Dumfries, <• 91 Dunfermline, 99 Dunkeid, « 105 Edinburgh, -. . 1 1 1 Elgin, . . . ]29 Forfar, -135 Forres, 139 Glasgow, . 117', {}), ("Descriptive account of the principal towns in Scotland: to accompany Wood's town atlas", '1828', 1828, 'Edinburgh', '/home/tdm/datasets/nls-data-gazetteersOfScotland/97350713', 'alto/97350923.34.xml', 'page', 'Page18', 376, 'book', 'nls', 'Xll Greenock, 171 Haddington, 181 Hamilton, 185 Hawick, 191 Inverary, 199 Inverness, . * •> 203 Irvine, * 211 Jedburgh, * * 215 Kelso, 221 Kilmarnock, • 227 Kirkcaldy 233 Kinross, * * 241 Lanark, * 247 Leith, 253 Linlithgow, «• * 265 Montrose, 271 Nairn, 277 Paisley, 281 Peebles, 291 Perth, * 297 Portobello, 309 Rothesay, * 313 Selkirk, > , 319 St Andrews, 323 Stirling, -^331 Stonehaven, * 339 Stornowav, ... Si-5', {('Hamilton', '1'): ('55.77731433348086', '-4.067392672500774'), ('Inverary', '2'): ('56.2333333', '-5.0666667'), ('Inverness', '3'): ('57.47871409771949', '-4.212450527351024'), ('Lanark', '4'): ('55.67483195471274', '-3.775417694605498')}), :param archives: RDD of defoe.nls.archive.Archive :type archives: pyspark.rdd.PipelinedRDD :param config_file: query configuration file :type config_file: str or unicode :param logger: logger (unused) :type logger: py4j.java_gateway.JavaObject :return: "0" :rtype: string """ with open(config_file, "r") as f: config = yaml.load(f) lang_model = config["lang_model"] documents = archives.flatMap( lambda archive: [(document.year, document.title, document.edition, \ document.archive.filename, document) for document in list(archive)]) pages_clean = documents.flatMap( lambda year_document: [(year_document[0], year_document[1], year_document[2],\ year_document[3], page.code, page.page_id, clean_page_as_string(page)) for page in year_document[4]]) matching_pages = pages_clean.map(lambda geo_page: ( geo_page[0], { "title": geo_page[1], "edition": geo_page[2], "archive": geo_page[3], "page_filename": geo_page[4], "text_unit id": geo_page[5], "lang_model": lang_model, "georesolution_page": georesolve_page_2(geo_page[6], lang_model) })) result = matching_pages \ .groupByKey() \ .map(lambda date_context: (date_context[0], list(date_context[1]))) \ .collect() return result
def do_query(archives, config_file=None, logger=None, context=None): """ It ingest NLS pages, applies scpaCy NLP pipeline for identifying the possible locations of each page. And it applies the edinburgh geoparser (just the georesolver) for getting the latituted and longitude of each of them. Before applying the spaCy NLP, two clean steps are applied - long-S and hyphen words. A config_file must be the path to a lexicon file with a list of the keywords to search for, one per line. A config_file should be indicated to specify the lang_model, gazetteer to use, the defoe_path, the bounding box (optional), as well as the operating system. Example: - 1842: - archive: /home/rosa_filgueira_vicente/datasets/sg_simple_sample/97437554 - edition: 1842, Volume 1 - georesolution_page: - Aberdeenshire-19: - in-cc: '' - lat: '57.21923117162595' - long: '-2.801013003249016' - pop: '' - snippet: 'BUCHAN , a district of Aberdeenshire , extending along the coast ' - type: civila - Cumberland-12: - in-cc: '' - lat: '51.4342921249674' - long: '-0.6131610294930387' - pop: '' - snippet: 'all the low country of Cumberland lies full before you , ' - type: fac .... - lang_model: en_core_web_lg - page_filename: alto/97440572.34.xml - text_unit id: Page252 - title: topographical, statistical, and historical gazetteer of Scotland :param archives: RDD of defoe.nls.archive.Archive :type archives: pyspark.rdd.PipelinedRDD :param config_file: query configuration file :type config_file: str or unicode :param logger: logger (unused) :type logger: py4j.java_gateway.JavaObject :return: "0" :rtype: string """ with open(config_file, "r") as f: config = yaml.load(f) lang_model = config["lang_model"] gazetteer = config["gazetteer"] if "bounding_box" in config: bounding_box = " -lb " + config["bounding_box"] + " 2" else: bounding_box = "" if "os_type" in config: if config["os_type"] == "linux": os_type = "sys-i386-64" else: os_type= "sys-i386-snow-leopard" else: os_type = "sys-i386-64" if "defoe_path" in config : defoe_path = config["defoe_path"] else: defoe_path = "./" documents = archives.flatMap( lambda archive: [(document.year, document.title, document.edition, \ document.archive.filename, document) for document in list(archive)]) pages_clean = documents.flatMap( lambda year_document: [(year_document[0], year_document[1], year_document[2],\ year_document[3], page.code, page.page_id, clean_page_as_string(page,defoe_path, os_type)) for page in year_document[4]]) matching_pages = pages_clean.map( lambda geo_page: (geo_page[0], {"title": geo_page[1], "edition": geo_page[2], "archive": geo_page[3], "page_filename": geo_page[4], "text_unit id": geo_page[5], "lang_model": lang_model, "georesolution_page": georesolve_page_2(geo_page[6],lang_model, defoe_path, gazetteer, bounding_box)})) result = matching_pages \ .groupByKey() \ .map(lambda date_context: (date_context[0], list(date_context[1]))) \ .collect() return result
def do_query(df, config_file=None, logger=None, context=None): """ Retrieves NLS pages from ES, which have been previously clean and stored. Applies scpaCy NLP pipeline for identifying the possible locations of each page. And applies the edinburgh geoparser for getting the latituted and longitude of each of them. A config_file must be the path to a lexicon file with a list of the keywords to search for, one per line. A config_file should be indicated to specify the lang_model, gazetteer to use, the defoe_path, the bounding box (optional), as well as the operating system. Example: - 1842: - archive: /home/rosa_filgueira_vicente/datasets/sg_simple_sample/97437554 - edition: 1842, Volume 1 - georesolution_page: - Aberdeenshire-19: - in-cc: '' - lat: '57.21923117162595' - long: '-2.801013003249016' - pop: '' - snippet: 'BUCHAN , a district of Aberdeenshire , extending along the coast ' - type: civila - Cumberland-12: - in-cc: '' - lat: '51.4342921249674' - long: '-0.6131610294930387' - pop: '' - snippet: 'all the low country of Cumberland lies full before you , ' - type: fac .... - lang_model: en_core_web_lg - page_filename: alto/97440572.34.xml :param archives: RDD of defoe.nls.archive.Archive :type archives: pyspark.rdd.PipelinedRDD :param config_file: query configuration file :type config_file: str or unicode :param logger: logger (unused) :type logger: py4j.java_gateway.JavaObject :return: :rtype: string """ with open(config_file, "r") as f: config = yaml.load(f) lang_model = config["lang_model"] gazetteer = config["gazetteer"] if "bounding_box" in config: bounding_box = " -lb " + config["bounding_box"] + " 2" else: bounding_box = "" if "os_type" in config: if config["os_type"] == "linux": os_type = "sys-i386-64" else: os_type= "sys-i386-snow-leopard" else: os_type = "sys-i386-64" if "defoe_path" in config : defoe_path = config["defoe_path"] else: defoe_path = "./" fdf = df.withColumn("source_text_clean", blank_as_null("source_text_clean")) #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]==year).filter(df["archive_filename"]=="/home/tdm/datasets/nls-data-gazetteersOfScotland/97376462").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean) #newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]=="1883").filter(df["edition"]=="1884-1885, Volume 3").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean) newdf=fdf.filter(fdf.source_text_clean.isNotNull()).filter(fdf["model"]=="nls").filter(df["year"]=="1828").select(fdf.year, fdf.title, fdf.edition, fdf.archive_filename, fdf.source_text_filename, fdf.text_unit_id, fdf.source_text_clean) pages=newdf.rdd.map(tuple) matching_pages = pages_clean.map( lambda geo_page: (geo_page[0], {"title": geo_page[1], "edition": geo_page[2], "archive": geo_page[3], "page_filename": geo_page[4], "text_unit id": geo_page[5], "lang_model": lang_model, "georesolution_page": georesolve_page_2(geo_page[6],lang_model, defoe_path, gazetteer, bounding_box)})) result = matching_pages \ .groupByKey() \ .map(lambda date_context: (date_context[0], list(date_context[1]))) \ .collect() return result