Пример #1
0
def _first_mismatch_offset(article, entities):
    """Return the smallest non-zero offset at which an entity is misplaced.

    An occurrence is treated as misplaced when the two characters of
    ``article`` starting at the occurrence's first boundary value,
    lower-cased, differ from the first two characters of the entity string.
    Returns ``len(article)`` when no such occurrence is found.
    """
    first_mistake = len(article)
    for entity in entities:
        # Empty records carry no boundaries to check.
        if len(entity) == 0:
            continue
        for pair in entity['Boundaries']:
            start = pair[0]
            # Only the article side is lower-cased; presumably entity strings
            # are already stored lower-cased -- TODO confirm.
            if article[start:start + 2].lower() != entity['Entity'][0:2]:
                # Mismatches at offset 0 are deliberately not recorded
                # (behaviour kept from the original logic).
                if first_mistake > start and start != 0:
                    first_mistake = start
    return first_mistake


def processing(letter):
    """Report articles of one corpus sub-directory with misplaced entities.

    Every entry of ``paths.pathCorpus/<letter>`` is expected to hold an
    ``article`` file and a ``res.json`` file, which are loaded through
    ``marking.read_article`` and ``marking.read_json``.  For each article
    that has at least one misplaced entity occurrence, a line
    ``<name> <first offset> <article length>`` is written to
    ``<letter>_errors.txt`` in the current working directory.  The file ends
    with three blank lines followed by the number of reported articles.

    Articles that raise any exception while being read or checked are
    skipped silently (best-effort scan).

    Args:
        letter: Name of the corpus sub-directory to scan; also used as the
            prefix of the report file name.
    """
    letter_dir = os.path.join(paths.pathCorpus, letter)
    report_path = os.path.join(os.getcwd(), letter + '_errors.txt')
    count = 0
    # The context manager guarantees the report is closed even when listing
    # the directory or writing fails part-way through.
    with open(report_path, "w") as errors:
        articles = listdir(letter_dir)
        print(letter)
        for name_article in articles:
            article_dir = os.path.join(letter_dir, name_article)
            patharticle = os.path.join(article_dir, "article")
            pathentites = os.path.join(article_dir, "res.json")
            try:
                json_of_article = marking.read_json(pathentites)
                article = marking.read_article(patharticle)
                first_mistake = _first_mismatch_offset(article, json_of_article)
                if first_mistake != len(article):
                    line = name_article + " " + str(first_mistake) + " " + str(len(article)) + '\n'
                    errors.write(line)
                    count += 1
            except Exception:
                # Deliberate best-effort: a broken or incomplete article must
                # not abort the scan of the remaining ones.
                continue
        errors.write("\n\n\n" + str(count))
Пример #2
0
import marking
import os
import sys
import argparse
from os import listdir

# Command-line options: location of the annotated corpus and of the directory
# that receives the rendered HTML pages.  Both default to sub-directories of
# the current working directory.
parser = argparse.ArgumentParser()
parser.add_argument('--pathCorpus', default=os.getcwd() + os.sep + 'Corpus')
parser.add_argument('--pathHTMLs', default=os.getcwd() + os.sep + 'NERCorpus')
paths = parser.parse_args(sys.argv[1:])

# makedirs (rather than mkdir) so that a nested --pathHTMLs whose parents do
# not exist yet is still created.
if not os.path.exists(paths.pathHTMLs):
    os.makedirs(paths.pathHTMLs)

# Corpus layout walked below: <pathCorpus>/<q1>/<q3>/{article, res.json}.
# Each article is rendered to <pathHTMLs>/<q1>/<q3>.html.
allfiles = listdir(paths.pathCorpus)
for q1 in allfiles:
    group_dir = os.path.join(paths.pathCorpus, q1)
    # A stray regular file in the corpus root would make listdir() raise and
    # abort the whole run; skip anything that is not a directory.
    if not os.path.isdir(group_dir):
        continue
    q2 = listdir(group_dir)
    print(q1)
    html_dir = os.path.join(paths.pathHTMLs, q1)
    if not os.path.exists(html_dir):
        os.makedirs(html_dir)
    for q3 in q2:
        path = os.path.join(group_dir, q3, "article")
        pathout = os.path.join(group_dir, q3, "res.json")
        try:
            json_of_article = marking.read_json(pathout)
            article = marking.read_article(path)
            entities = marking.get_entities(json_of_article)
            marking.make_html(article, entities, os.path.join(html_dir, q3 + ".html"))
        except Exception:
            # Deliberate best-effort: an unreadable or incomplete article is
            # skipped so the remaining ones are still converted.
            continue