def get_headers_for_document_id(document_id, document_text=None):
    """Return the header data extracted from a document's text.

    When *document_text* is ``None`` the text is fetched with a newly
    created ``OctavoEccoClient`` using *document_id* (the ``'text'`` entry
    of the response is used). When a text is supplied it is used as-is and
    no client is created.

    The result is whatever ``get_headers_from_document_text`` returns for
    that text.
    """
    text = document_text
    if text is None:
        response = OctavoEccoClient().get_text_for_document_id(document_id)
        text = response.get('text')
    return get_headers_from_document_text(text)
def set_octavo_indices(self):
    """Set the Octavo index on every fragment in ``self.fragments_by_ecco_id``.

    For each document id the full text is fetched once and passed to
    ``set_octavo_index`` of each of that document's fragments:

    * ids whose string form contains ``"_note_"`` get ``None`` as text,
    * ids shorter than 10 characters are fetched with the EEBO client,
    * all other ids are fetched with the ECCO client.
    """
    ecco_client = OctavoEccoClient()
    eebo_client = OctavoEeboClient()
    for doc_id, fragments in self.fragments_by_ecco_id.items():
        if "_note_" in str(doc_id):
            # TODO: handle the notes somehow!
            doc_text = None
        else:
            # Pick the client by id length, then fetch the text once.
            client = eebo_client if len(doc_id) < 10 else ecco_client
            doc_text = client.get_text_for_document_id(doc_id).get('text')
        for fragment in fragments:
            fragment.set_octavo_index(doc_text)
from lib.octavo_api_client import (
    OctavoEccoClient,
    OctavoEccoClusterClient
    )
from lib.fragmentlists import (
    get_fragmentlist,
    get_doctext_indexmap,
    test_fragment_text)
from lib.utils_common import create_dir_if_not_exists
from lib.headerdata_dump_common import read_docid_asciimap_csv

# API client instances used by the rest of the script. The cluster client
# is created with timeout=600 (units are not shown in this excerpt).
ecco_api_client = OctavoEccoClient()
cluster_api_client = OctavoEccoClusterClient(timeout=600)

# Mapping loaded from a CSV file; presumably keyed by document id, judging
# by the reader's name -- TODO confirm against read_docid_asciimap_csv.
docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')

# Field name lists; only field_eccocluster is used in this excerpt.
fields_ecco = ["documentID", "content"]
field_eccocluster = ["documentID", "fragmentID", "text", "startIndex", "endIndex"]

# Hard-coded id of the single document this script processes.
docid_to_process = "0162900301"

# Fetch the cluster data for that document, restricted to the
# field_eccocluster fields.
docid_clusterdata = (
    cluster_api_client.get_cluster_data_for_document_id(
        docid_to_process, fields=field_eccocluster))
def get_datadir():
    """Return the data directory passed as the single command line argument.

    Returns ``sys.argv[1]`` when exactly one argument follows the script
    name. Otherwise the interpreter is exited through ``sys.exit`` with a
    message: "Provide datadir." when no argument was given, and
    "Too many command line args." for any other argument count.
    """
    if len(sys.argv) == 1:
        sys.exit("Provide datadir.")
    elif len(sys.argv) == 2:
        return sys.argv[1]
    else:
        sys.exit("Too many command line args.")


# ---------------------------
# main script
# ---------------------------

# NOTE(review): both client objects are constructed and then immediately
# replaced ("local" / None). This looks like a manual switch between API
# and local data -- confirm it is intentional before removing either line,
# since the constructors still run.
ecco_api_client = OctavoEccoClient()
ecco_api_client = "local"
cluster_api_client = OctavoEccoClusterClient()
cluster_api_client = None

# docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv')
xml_img_page_datadir = ("../data/raw/ecco-xml-img/")

# Field name lists; not used within this excerpt.
fields_ecco = ["documentID", "content"]
field_eccocluster = [
    "documentID", "fragmentID", "text", "startIndex", "endIndex"
]

# Data directory from the command line, with a trailing "/" appended.
datadir = get_datadir() + "/"

# reuse data list of JSON files
# # "../data/work/hackathon/", # # "../data/work/hume_full/", # ] # all_data = [] # for path in jsonpaths: # jsonfiles = get_datafiles(path) # for jsonfileloc in jsonfiles: # with open(jsonfileloc, 'r') as jsonfile: # jsondata = json.load(jsonfile) # all_data.extend(jsondata) # xxxx ecco_api_client = OctavoEccoClient() eebo_api_client = OctavoEeboClient() char_offsets = {} text_ids = [] for item in galeitems: text_ids.append(item['document_id']) # text_ids.append(item['id_secondary']) # text_ids = text_ids[:50] # text_ids = ["A56206.headed_2_text", "A56206.headed_1_text"] # text_ids = ['0081400111'] # text_ids = ["A65112.headed_2_text"] # text_ids = ["A56206.headed_2_text"] # text_ids = ['0818700401'] # text_ids = ["A90295.headed_1_text"]
get_headers_from_document_text)  # tail of an import list opened above this excerpt
from lib.author_metadata import read_author_metadata_csv
from lib.text_reuse_common import (load_good_metadata)


def read_txt_file_to_string(file_path):
    """Read the file at *file_path* in text mode and return its whole content.

    The file is opened with mode ``'r'`` and no explicit encoding, so the
    platform default encoding applies.
    """
    with open(file_path, 'r') as txtfile:
        str_data = txtfile.read()
        return str_data


# document_id = document_id_dict.get('id')
# get doc from api
# api_limit = -1
ecco_api_client = OctavoEccoClient()
cluster_api_client = OctavoEccoClusterClient(limit=-1, timeout=60)

# Hard-coded id of the document processed by this script.
document_id = "0175300500"
# Indexing with ['text'] raises KeyError if the response has no 'text' entry.
document_text = ecco_api_client.get_text_for_document_id(document_id)['text']
# document_text = document_data.get('text')
document_meta = ecco_api_client.get_document_id_metadata(document_id)

# test_text_loc = "/media/vvaara/uh-villevaara-ext1/eccotxt/ECCO_I/ECCO_2of2/RelAndPhil/0010800104/xml/0010800104.txt"
# test_text = read_txt_file_to_string(test_text_loc)

# The already-fetched text is passed along with the id.
headerdata = get_headers_for_document_id(document_id, document_text)

print("> Fetching clusterIDs ...")
cluster_ids = cluster_api_client.get_cluster_ids_list_for_document_id(
    document_id)
print(" >> Done!")
# page_snip_start = snip_start # else: # page_snip_start = page_first_char_index # page_snip = fulltext[page_snip_start:(page_snip_end + 1)] # snip_page_dict[page_number] = { # 'snip_start': page_snip_start, # 'snip_end': page_snip_end, # 'snip_text': page_snip # } # return snip_page_dict # transfer >>> # estc T082481 cluster_api_client = OctavoEccoClusterClient(timeout=600) ecco_api_client = OctavoEccoClient() docids_asciimap = read_docid_asciimap_csv('data/eccoids/asciilines.csv') fields_ecco = ["documentID", "content"] field_eccocluster = [ "documentID", "fragmentID", "text", "startIndex", "endIndex" ] docid_text = ecco_api_client.get_text_for_document_id('0162200200').get('text') from lib.octavo_api_client import (OctavoEccoClient, OctavoEccoClusterClient) from lib.tr_bookcontainer import BookContainer ecco_api_client = OctavoEccoClient() humebook = BookContainer(
# find_ecco_id.py from lib.octavo_api_client import ( OctavoEccoClient, ) def print_response(responsedata): if len(responsedata) == 0: print("\nNo match\n") else: for item in responsedata: print() for key, value in item.items(): print(key + ": " + str(value)) print() ecco_api_client = OctavoEccoClient() print_response(ecco_api_client.get_estc_id_metadata("R223440"))
# Output path prefixes, one appended per processed document.
all_outpaths = []
# Per-document summary records keyed by document id.
documents_meta_dict = {}

for document_id_dict in document_ids:
    document_id = document_id_dict.get('id')

    # NOTE(review): the year filters are only reassigned when the entry
    # carries a value other than -1; otherwise they keep whatever they held
    # before (an earlier iteration or code above this excerpt) -- confirm
    # this carry-over is intended.
    # NOTE(review): unlike the "below" check, the "above" check does not
    # exclude None, so a missing key assigns None here -- confirm.
    if (document_id_dict.get('filter_out_year_above') != -1):
        filter_out_year_above = document_id_dict.get('filter_out_year_above')
    if (document_id_dict.get('filter_out_year_below') != -1 and
            document_id_dict.get('filter_out_year_below') is not None):
        filter_out_year_below = document_id_dict.get('filter_out_year_below')

    outpath_prefix = outpath_prefix_base + "/" + document_id
    all_outpaths.append(get_outpath_prefix_with_date(outpath_prefix))

    # get doc from api
    # New client objects are created on every iteration.
    ecco_api_client = OctavoEccoClient()
    cluster_api_client = OctavoEccoClusterClient(limit=api_limit, timeout=60)
    document_data = ecco_api_client.get_text_for_document_id(document_id)
    document_text = document_data.get('text')

    # 'length' is the character count of the fetched text; 'sequence' and
    # 'description' are copied from the input entry (None when absent).
    documents_meta_dict[document_id] = {
        'id': document_id,
        'length': len(document_text),
        'sequence': document_id_dict.get('sequence'),
        'description': document_id_dict.get('description')
    }

    # The already-fetched text is passed along with the id.
    headerdata = get_headers_for_document_id(document_id, document_text)

    print("> Fetching clusterIDs ...")
    cluster_ids = cluster_api_client.get_cluster_ids_list_for_document_id(
        document_id)