# Example: extract tables from an HTML document, then run a glossary-driven
# table data extractor over the stored tables.

# Create the document from the sample HTML and run the table extractor on
# the first segment matching "$.raw_content".
doc = etk.create_document(
    sample_html,
    mime_type="text/html",
    url="http://ex.com/123",
)
my_table_extractor = TableExtractor()
d = doc.select_segments("$.raw_content")[0]
root = doc.select_segments("$")[0]
tables = doc.invoke_extractor(my_table_extractor, d)

# Store every extracted table on the root segment, one at a time, passing
# the table's own tag as the second argument.
for t in tables:
    root.store_extractions([t], t.tag, group_by_tags=False)

# Register one glossary per field on the table data extractor.
# NOTE: the caliber glossary file is spelled "calibre_dict.txt" while the
# field name is "caliber"; both strings are kept exactly as they were.
table_data_extractor = EntityTableDataExtraction()
for glossary_path, field_name in (
    ("./resources/address_dict.txt", "address"),
    ("./resources/calibre_dict.txt", "caliber"),
    ("./resources/capacity_dict.txt", "capacity"),
    ("./resources/manufacturer_dict.txt", "manufacturer"),
    ("./resources/price_dict.txt", "price"),
):
    table_data_extractor.add_glossary(
        etk.load_glossary(glossary_path),
        field_name,
    )

# Re-select the segments under "$.tables[*]" and the root, then run the data
# extractor on each table segment and store its results on the root.
tables = doc.select_segments("$.tables[*]")
root = doc.select_segments("$")[0]
for t in tables:
    extractions = doc.invoke_extractor(table_data_extractor, t)
    root.store_extractions(extractions, "table_data_extraction")
"description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students." } ], "members": [ { "name": "Dongyu Li", "description": "03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994." } ] } etk = ETK() doc = etk.create_document(sample_input) # example for glossary extractor: name_extractor = GlossaryExtractor(etk.load_glossary("./names.txt"), "name_extractor", etk.default_tokenizer, case_sensitive=False, ngrams=1) descriptions = doc.select_segments("projects[*].description") projects = doc.select_segments("projects[*]") for d, p in zip(descriptions, projects): print ("Iam d path: " + d.full_path) names = doc.invoke_extractor(name_extractor, d) p.store_extractions(names, "members") # example for date extractor: date_extractor = DateExtractor('test_date_parser') member_descriptions = doc.select_segments("members[*].description") members = doc.select_segments("members[*]") for m_d, m in zip(member_descriptions, members):