def _first_mistake_offset(article, entities):
    """Return the smallest non-zero misaligned boundary offset, or None.

    For every non-empty entity, each pair in ``entity['Boundaries']`` is
    checked: the two characters of ``article`` starting at ``pair[0]``
    (lower-cased) must equal the first two characters of
    ``entity['Entity']``.  Mismatches at offset 0 and at offsets that are
    not smaller than ``len(article)`` are never reported.

    NOTE(review): only the article side is lower-cased; presumably the
    stored entity text is already lower case -- confirm against the data.
    """
    article_length = len(article)
    first_mistake = article_length
    for entity in entities:
        if not entity:
            continue
        for pair in entity['Boundaries']:
            start = pair[0]
            if article[start:start + 2].lower() == entity['Entity'][0:2]:
                continue
            if start != 0 and start < first_mistake:
                first_mistake = start
    if first_mistake == article_length:
        return None
    return first_mistake


def processing(letter):
    """Write ``<letter>_errors.txt`` listing articles with misaligned entities.

    Every sub-directory of ``paths.pathCorpus/<letter>`` is treated as one
    article: its ``article`` file is read with ``marking.read_article`` and
    its ``res.json`` file with ``marking.read_json``.  For each article that
    has at least one misaligned entity boundary, a line
    ``"<name> <first_mistake_offset> <article_length>"`` is written to the
    report.  The report ends with three newlines followed by the number of
    reported articles.  The report is created in the current working
    directory.

    Articles that raise any exception while being read or checked are
    skipped (best effort), so they are neither reported nor counted.
    """
    count = 0
    report_path = os.path.join(os.getcwd(), letter + '_errors.txt')
    # The context manager guarantees the report is closed even when listing
    # the corpus directory or the final write fails.
    with open(report_path, "w") as errors:
        letter_dir = os.path.join(paths.pathCorpus, letter)
        articles = listdir(letter_dir)
        print(letter)
        for name_article in articles:
            article_dir = os.path.join(letter_dir, name_article)
            patharticle = os.path.join(article_dir, "article")
            pathentites = os.path.join(article_dir, "res.json")
            try:
                json_of_article = marking.read_json(pathentites)
                article = marking.read_article(patharticle)
                first_mistake = _first_mistake_offset(article, json_of_article)
                if first_mistake is not None:
                    fields = [name_article, str(first_mistake), str(len(article))]
                    errors.write(" ".join(fields) + '\n')
                    count += 1
            except Exception:
                # Deliberate best effort: an unreadable or malformed article
                # must not abort the scan of the remaining ones.
                continue
        errors.write("\n\n\n" + str(count))
import argparse
import os
import sys
from os import listdir

import marking

# Command-line configuration.  ``paths`` stays a module-level name because
# other code in this project reads it as a global.
parser = argparse.ArgumentParser()
parser.add_argument('--pathCorpus', default=os.path.join(os.getcwd(), 'Corpus'))
parser.add_argument('--pathHTMLs', default=os.path.join(os.getcwd(), 'NERCorpus'))
paths = parser.parse_args(sys.argv[1:])

# makedirs (rather than mkdir) so a nested --pathHTMLs can be created too.
if not os.path.isdir(paths.pathHTMLs):
    os.makedirs(paths.pathHTMLs)

# Corpus layout used below: <pathCorpus>/<section>/<article>/{article,res.json}.
# One HTML file is written per article to <pathHTMLs>/<section>/<article>.html.
for section in listdir(paths.pathCorpus):
    section_dir = os.path.join(paths.pathCorpus, section)
    if not os.path.isdir(section_dir):
        # A stray regular file in the corpus root would make listdir() raise
        # and abort the whole run; skip it instead.
        continue
    article_names = listdir(section_dir)
    print(section)

    html_dir = os.path.join(paths.pathHTMLs, section)
    if not os.path.isdir(html_dir):
        os.mkdir(html_dir)

    for article_name in article_names:
        article_dir = os.path.join(section_dir, article_name)
        path = os.path.join(article_dir, "article")
        pathout = os.path.join(article_dir, "res.json")
        try:
            json_of_article = marking.read_json(pathout)
            article = marking.read_article(path)
            entities = marking.get_entities(json_of_article)
            marking.make_html(article, entities,
                              os.path.join(html_dir, article_name + ".html"))
        except Exception as exc:
            # Best effort: one broken article must not stop the conversion,
            # but leave a trace on stderr so the failure is not invisible.
            # %r is used because repr() of an exception cannot itself fail
            # on non-ASCII messages under Python 2.
            sys.stderr.write("skipping %s: %r\n" % (article_dir, exc))
            continue