def process_spacy_and_convert_to_naf(self,
                                     nlp,
                                     dct,  # in a next iteration, we can make this a class attribute
                                     layers,
                                     output_path=None):
    """
    process with spaCy and convert to NAF

    :param nlp: spaCy language model
    :param datetime.datetime dct: document creation time
    :param set layers: layers to convert to NAF, e.g., {'raw', 'text', 'terms'}
    :param output_path: if provided, NAF is saved to that file

    :return: the root of the NAF XML object
    """
    root = spacy_to_naf.text_to_NAF(text=self.content,
                                    nlp=nlp,
                                    dct=dct,
                                    layers=layers,
                                    title=self.name,
                                    uri=self.uri,
                                    language=self.language)

    if output_path is not None:
        with open(output_path, 'w') as outfile:
            outfile.write(spacy_to_naf.NAF_to_string(NAF=root))

    return root

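# Minimal usage sketch for the method above. The snippet does not show the class
# the method belongs to, so the stand-in class "Document" below is an assumption
# made purely for illustration; only the attributes the method reads
# (.content, .name, .uri, .language) are modelled.
import spacy
import spacy_to_naf
from datetime import datetime


class Document:  # hypothetical stand-in for the owning class
    process_spacy_and_convert_to_naf = process_spacy_and_convert_to_naf

    def __init__(self, content, name, uri, language):
        self.content = content
        self.name = name
        self.uri = uri
        self.language = language


nlp = spacy.load('en_core_web_sm')
doc = Document(content='Tom Cruise is an actor.',
               name='example_article',
               uri='http://example.org/example_article',
               language='en')
root = doc.process_spacy_and_convert_to_naf(nlp=nlp,
                                            dct=datetime.now(),
                                            layers={'raw', 'text', 'terms'},
                                            output_path='example_article.naf')
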
def text_to_naf(wiki_title,
                target_languages,
                text,
                wiki_uri,
                annotations,
                prefix,
                language,
                nlp,
                dct,
                output_folder=None,
                wiki_langlinks={},
                verbose=0):
    assert language in target_languages, f'{language} not part of supported languages: {" ".join(target_languages)}'

    # parse with spaCy
    add_mw = False
    if language in {'en', 'nl'}:
        add_mw = True

    try:
        naf = spacy_to_naf.text_to_NAF(text=text,
                                       nlp=nlp,
                                       dct=dct,
                                       layers={'raw', 'text', 'terms', 'deps'},
                                       naf_version='v3.1',
                                       title=wiki_title,
                                       uri=wiki_uri,
                                       language=language,
                                       add_mws=add_mw)
        assert naf.find('raw').text == text, f'mismatch between raw text JSON and NAF file'
    except:
        return

    # add hyperlinks as entity elements
    add_hyperlinks(naf,
                   annotations,
                   prefix,
                   language,
                   dct,
                   wiki_langlinks=wiki_langlinks)

    # if wanted, write output to disk
    if output_folder is not None:
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        lang_dir = os.path.join(output_folder, language)
        if not os.path.exists(lang_dir):
            os.mkdir(lang_dir)
        output_path = os.path.join(lang_dir, f'{wiki_title}.naf')
        spacy_to_naf.NAF_to_file(naf, output_path)
        if verbose >= 3:
            print(f'saved to {output_path}')

    return naf

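# Minimal usage sketch for text_to_naf above. All argument values are assumptions
# for illustration; add_hyperlinks is project code defined elsewhere and must be
# available in the same module, and the annotation list would normally come from
# the Wikipedia extraction step.
import os

import spacy
import spacy_to_naf
from datetime import datetime

nlp = spacy.load('en_core_web_sm')
naf = text_to_naf(wiki_title='Tom Cruise',
                  target_languages={'en', 'nl', 'it'},
                  text='Tom Cruise is an actor.',
                  wiki_uri='https://en.wikipedia.org/wiki/Tom_Cruise',
                  annotations=[],  # hyperlink annotations from the extraction step (empty here)
                  prefix='https://en.wikipedia.org/wiki/',
                  language='en',
                  nlp=nlp,
                  dct=datetime.now(),
                  output_folder='naf_output',
                  verbose=3)
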
def fileread():
    for filename in os.listdir(path + "rawTest"):
        if filename.endswith(".txt"):
            orig = open(path + "rawTest/" + filename, "r", encoding="utf8")
            f = open(filename.replace(".txt", ".naf"), "w", encoding="utf8")
            text = orig.read()
            # datetime.datetime.now() may just be datetime.now(), depending on
            # whether the datetime module or the datetime class was imported
            NAF = spacy_to_naf.text_to_NAF(
                text,
                nlp,
                dct=datetime.datetime.now(),
                layers={'raw', 'text', 'terms', 'entities', 'deps', 'chunks'})
            f.write(spacy_to_naf.NAF_to_string(NAF))
            orig.close()
            f.close()

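# fileread() relies on module-level names that the function itself does not
# define (path, nlp, os, datetime, spacy_to_naf). The setup below is a sketch
# with assumed values, shown only to make the snippet runnable.
import os
import datetime

import spacy
import spacy_to_naf

path = "./"  # assumed base folder containing a "rawTest" subfolder with .txt files
nlp = spacy.load('en_core_web_sm')  # assumed spaCy model

fileread()
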
def process_first_x_files(path_signalmedia_json,
                          path_newsreader_nafs='',
                          start=None,
                          end=None):
    """
    create generator of json objects (representing signalmedia articles)

    :param str path_signalmedia_json: path to all signalmedia articles in jsonl
    (originally called signalmedia-1m.jsonl)
    :param str path_newsreader_nafs: path to where signalmedia processed with the pipeline is stored in NAF
    :param int start: start line
    :param int end: end line

    :rtype: generator
    :return: generator of json objects
    """
    if end:
        if start is None:
            start = 1  # default to the first line when only an end line is given
        line_range = range(start, end + 1)

    news_item = namedtuple('news_item', ['signalmedia_json', 'preprocessing'])
    path_template = '{path_newsreader_nafs}/{identifier}.in.naf'

    with open(path_signalmedia_json) as infile:
        for counter, line in enumerate(infile, 1):
            if end:
                if counter not in line_range:
                    continue
                if counter > end:
                    break

            article = json.loads(line)
            identifier = article['id']

            spacy_naf = spacy_to_naf.text_to_NAF(article['content'], nlp)
            the_preprocessing = {('spacy', spacy_naf)}

            if path_newsreader_nafs:
                path_newsreader_naf = path_template.format_map(locals())
                if os.path.exists(path_newsreader_naf):
                    newsreader_naf = etree.parse(path_newsreader_naf)
                    the_preprocessing.add(('newsreader', newsreader_naf))

            a_news_item = news_item(signalmedia_json=article,
                                    preprocessing=the_preprocessing)
            yield a_news_item

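# Minimal usage sketch for process_first_x_files above. The jsonl path is an
# assumption for illustration, and a spaCy model is loaded into the module-level
# "nlp" that the generator uses.
import os
import json
from collections import namedtuple

import spacy
import spacy_to_naf
from lxml import etree

nlp = spacy.load('en_core_web_sm')

for a_news_item in process_first_x_files('signalmedia-1m.jsonl',  # assumed path to the jsonl file
                                         start=1,
                                         end=5):
    print(a_news_item.signalmedia_json['id'])
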
def text_to_naf(wiki_title,
                text,
                wiki_uri,
                annotations,
                prefix,
                language,
                nlp,
                dct,
                output_folder=None):
    assert language in {'nl', 'en', 'it'}, f'{language} not part of supported languages: nl it en'

    # parse with spaCy
    naf = spacy_to_naf.text_to_NAF(text=text,
                                   nlp=nlp,
                                   dct=dct,
                                   layers={'raw', 'text', 'terms'},
                                   title=wiki_title,
                                   uri=wiki_uri,
                                   language=language)
    assert naf.find('raw').text == text, f'mismatch between raw text JSON and NAF file'

    # add hyperlinks as entity elements
    add_hyperlinks(naf, annotations, prefix)

    # if wanted, write output to disk
    if output_folder is not None:
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        lang_dir = os.path.join(output_folder, language)
        if not os.path.exists(lang_dir):
            os.mkdir(lang_dir)
        output_path = os.path.join(lang_dir, f'{wiki_title}.naf')
        with open(output_path, 'w') as outfile:
            naf_string = spacy_to_naf.NAF_to_string(naf)
            outfile.write(naf_string)

    return naf

import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

nlp = spacy.load('nl_core_news_sm')

naf = text_to_NAF(
    "Hij nam de kat aan.",
    nlp,
    dct=datetime.now(),
    layers={'raw', 'text', 'terms', 'deps'},
    naf_version='v3.1',
    language='nl',
    layer_to_attributes_to_ignore={'terms': {'morphofeat', 'type'}},
    replace_hidden_characters=True,
    dtd_validation=True)

print(NAF_to_string(naf))

import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

nlp = spacy.load('en_core_web_sm')

naf = text_to_NAF('He gave up.',
                  nlp,
                  dct=datetime.now(),
                  layers={'raw', 'text', 'terms'},
                  naf_version='v3.1',
                  layer_to_attributes_to_ignore={'terms': {'morphofeat', 'type'}},
                  dtd_validation=True)

print(NAF_to_string(naf))

import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

nlp = spacy.load('en_core_web_sm')

naf = text_to_NAF(
    'Tom Cruise is an actor.\n\n\nHe likes to act.',
    nlp,
    dct=datetime.now(),
    layers={'raw', 'text', 'terms'},
    naf_version='v4',
    layer_to_attributes_to_ignore={'terms': {'morphofeat', 'type'}},
    replace_hidden_characters=True,
    map_udpos2naf_pos=True)  # map UD pos to NAF pos

print(NAF_to_string(naf))

import sys
sys.path.append('..')

import spacy
from lxml import etree
from spacy_to_naf import text_to_NAF, NAF_to_string
from spacy_to_naf import EntityElement, add_entity_element
from datetime import datetime

nlp = spacy.load('en_core_web_sm')

tree = text_to_NAF('Tom Cruise is an actor.\n\n\nHe likes to act.',
                   nlp,
                   dct=datetime.now(),
                   layers={'raw', 'text', 'terms'},
                   replace_hidden_characters=True,
                   map_udpos2naf_pos=True)  # map UD pos to NAF pos

root = tree.getroot()

# create the entities layer if it does not exist yet
entities_layer = root.find('entities')
if entities_layer is None:
    etree.SubElement(root, "entities")
    entities_layer = root.find('entities')

entity_data = EntityElement(eid='1',
                            entity_type='None',
                            targets=['t1', 't2'],
                            text='Tom Cruise',
                            ext_refs=[{'reference': 'https://en.wikipedia.org/wiki/Tom_Cruise'}])  # assumed example reference

import sys
sys.path.append('..')

import spacy
from lxml import etree
from spacy_to_naf import text_to_NAF, NAF_to_string
from spacy_to_naf import EntityElement, add_entity_element
from datetime import datetime

nlp = spacy.load('en_core_web_sm')

NAF = text_to_NAF(
    'The man saw the bird. The woman gave the gift to the person.',
    nlp,
    dct=datetime.now(),
    layers={'raw', 'text', 'terms', 'deps'},
    replace_hidden_characters=True,
    map_udpos2naf_pos=True)  # map UD pos to NAF pos

print(NAF_to_string(NAF))

def run_spacy_on_wiki_text_and_add_hyperlinks(wiki_title,
                                              prefix,
                                              language,
                                              nlp,
                                              wiki_folder,
                                              wiki_uri2relative_path,
                                              dct,
                                              output_folder=None,
                                              verbose=0):
    """
    :param str wiki_title: Wikipedia article title, e.g., "President van Frankrijk"
    :param str prefix: Wikipedia URI prefix prepended to the article title, e.g., "https://nl.wikipedia.org/wiki/"
    :param str language: supported: 'nl' | 'en' | 'it'
    :param nlp: loaded spaCy model, i.e., the result of calling spacy.load('MODELNAME')
    :param str wiki_folder: path to where extracted Wikipedia output is stored, e.g., the folder "wiki",
    with subfolders for the output per language
    :param dict wiki_uri2relative_path: mapping of an encoded Wikipedia URI to a (relative_path, line_number)
    tuple inside wiki_folder
    :param datetime.datetime dct: document creation time, date of crawling for Wikipedia
    :param output_folder: if provided, the NAF file will be written to output_folder/LANGUAGE/WIKI_TITLE.naf

    :rtype: tuple
    :return: (succes, reason, naf)
    """
    succes = True
    reason = 'succes'
    naf = None

    assert language in {'nl', 'en', 'it'}, f'{language} not part of supported languages: nl it en'

    # try to retrieve JSON of Wikipedia article
    wiki_uri = f'{prefix}{wiki_title.replace(" ", "_")}'
    wiki_uri_encoded = urlencode_wikititle(wiki_title, prefix=prefix)
    if verbose >= 2:
        print(wiki_uri)

    if wiki_uri_encoded not in wiki_uri2relative_path:
        reason = 'page not extracted'
        succes = False
    else:
        relative_path, line_number = wiki_uri2relative_path[wiki_uri_encoded]
        path = os.path.join(wiki_folder, relative_path)

        # load wiki_page
        wiki_page = {}
        with bz2.BZ2File(path, "r") as infile:
            for index, line in enumerate(infile):
                if index == line_number:
                    wiki_page = json.loads(line)
                    break

        assert wiki_page, f'index is wrong for {language} {wiki_title}'

        # parse with spaCy
        naf = spacy_to_naf.text_to_NAF(text=wiki_page['text'],
                                       nlp=nlp,
                                       dct=dct,
                                       layers={'raw', 'text', 'terms'},
                                       title=wiki_title,
                                       uri=wiki_uri,
                                       language=language)
        assert naf.find('raw').text == wiki_page['text'], f'mismatch between raw text JSON and NAF file'

        # add hyperlinks as entity elements
        add_hyperlinks(naf,
                       wiki_page['annotations'],
                       prefix,
                       verbose=verbose)

        # if wanted, write output to disk
        if output_folder is not None:
            if not os.path.exists(output_folder):
                os.mkdir(output_folder)
            lang_dir = os.path.join(output_folder, language)
            if not os.path.exists(lang_dir):
                os.mkdir(lang_dir)
            output_path = os.path.join(lang_dir, f'{wiki_title}.naf')
            with open(output_path, 'w') as outfile:
                naf_string = spacy_to_naf.NAF_to_string(naf)
                outfile.write(naf_string)
            if verbose >= 2:
                print(f'written {wiki_title} ({language}) to {output_path}')

    message = f'succes:{succes} with reason: {reason} for {wiki_title} ({language})'
    if verbose >= 3:
        print(message)
    if all([verbose == 2, not succes]):
        print(message)

    # return whether processing was successful, the reason, and the NAF object
    return succes, reason, naf

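# Minimal usage sketch for run_spacy_on_wiki_text_and_add_hyperlinks above. The
# folder name, the mapping entry and the model are assumptions for illustration;
# the module defining the function must also provide its own imports (os, bz2,
# json, spacy_to_naf) and the project helpers urlencode_wikititle and
# add_hyperlinks, and wiki_uri2relative_path would normally be built from the
# extracted Wikipedia dump.
import spacy
from datetime import datetime

nlp = spacy.load('nl_core_news_sm')
wiki_uri2relative_path = {
    'https://nl.wikipedia.org/wiki/President_van_Frankrijk': ('nl/AA/wiki_00.bz2', 0),  # assumed entry
}

succes, reason, naf = run_spacy_on_wiki_text_and_add_hyperlinks(
    wiki_title='President van Frankrijk',
    prefix='https://nl.wikipedia.org/wiki/',
    language='nl',
    nlp=nlp,
    wiki_folder='wiki',  # assumed folder holding the extracted dump
    wiki_uri2relative_path=wiki_uri2relative_path,
    dct=datetime.now(),
    output_folder='naf_output',
    verbose=2)
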
import spacy
from lxml import etree
from spacy_to_naf import text_to_NAF, NAF_to_string
from spacy_to_naf import EntityElement, add_entity_element
from spacy_to_naf import time_in_correct_format
from spacy_to_naf import add_linguisticProcessors_el
from datetime import datetime

nlp = spacy.load('en_core_web_sm')
naf_version = 'v3.1'
now = datetime.now()

tree = text_to_NAF('Tom Cruise is an actor.\n\n\nHe likes to act.',
                   nlp,
                   dct=now,
                   naf_version=naf_version,
                   layers={'raw', 'text', 'terms'},
                   replace_hidden_characters=True,
                   map_udpos2naf_pos=False,  # do not map UD pos to NAF pos
                   dtd_validation=True)

root = tree.getroot()
naf_header = root.find('nafHeader')

# register an extra linguistic processor in the NAF header
time_as_string = time_in_correct_format(now)
modelname = 'Wikipedia hyperlinks'
add_linguisticProcessors_el(naf_header,
                            layer='entities',
                            start_time=time_as_string,
                            end_time=time_as_string,
                            modelname=modelname)

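# Follow-up sketch: after registering the extra linguistic processor in the
# header, the document can be serialized as in the earlier snippets (assuming
# NAF_to_string accepts the object returned by text_to_NAF).
print(NAF_to_string(tree))
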
import sys
sys.path.append('..')

import spacy
from datetime import datetime
from spacy_to_naf import text_to_NAF, NAF_to_string

nlp = spacy.load('it_core_news_sm')

naf = text_to_NAF(
    'Tom Cruise is an actor.\n\n\nHe likes to act.',
    nlp,
    dct=datetime.now(),
    layers={'raw', 'text', 'terms'},
    naf_version='v3.1',
    layer_to_attributes_to_ignore={'terms': {'morphofeat', 'type'}},
    dtd_validation=True)