Пример #1
0
def _run_timed_phase(logfile, phase_name, phase_func, **phase_kwargs):
    """Run one pipeline phase and broadcast how long it took.

    Args:
        logfile: Open log file; forwarded to the phase as ``logfile=`` and
            to ``broadcast``.
        phase_name: Label used in the "<phase_name> phase took ..." message.
        phase_func: Callable implementing the phase.
        **phase_kwargs: Remaining keyword arguments for ``phase_func``.
    """
    start = datetime.now()
    phase_func(logfile=logfile, **phase_kwargs)
    elapsed = datetime.now() - start
    broadcast(logfile, "%s phase took %s" % (phase_name, elapsed))


def do_the_business():
    """Run every enabled pipeline stage in order, logging to ``logfile_name``.

    Configuration (file list, paths and the ``do_*`` switches) is read from
    names defined outside this function.  The stages, each optional, run in
    this order: delete database, analyse, crossreference, convert
    (optionally followed by ``delete_html``), disambiguation, index.

    Raises:
        OSError: if the database file exists but cannot be removed.
    """
    import errno  # function-local: only the database deletion needs it

    with open(logfile_name, 'w') as logfile:

        # some details
        broadcast(logfile, "File list contains %d files" % len(file_list))

        # delete the database
        if do_delete_db:
            try:
                os.remove(dbfile_name)
            except OSError as err:
                # An already-missing database is the state we wanted anyway
                # (e.g. first run); any other failure is still fatal.
                if err.errno != errno.ENOENT:
                    raise

        # analysis stage
        if do_analyse:
            _run_timed_phase(
                logfile, "Analyse", analyse.analyse,
                file_list=file_list,
                dbfile_name=dbfile_name,
                use_multiprocessing=use_multiprocessing,
                rel_judgment_dir=rel_judgment_dir,
            )

        # crossreference stage
        if do_crossreference:
            _run_timed_phase(
                logfile, "Crossreference", crossreference.crossreference,
                file_list=file_list,
                dbfile_name=dbfile_name,
                use_multiprocessing=use_multiprocessing,
            )

        # convert stage
        if do_convert:
            # Timestamp taken before conversion starts and handed to
            # delete_html afterwards (presumably so it can tell output of
            # this run from older files -- confirm in delete_html).
            conversion_start = time.time()
            _run_timed_phase(
                logfile, "Convert", convert.convert,
                file_list=file_list,
                dbfile_name=dbfile_name,
                public_html_dir=public_html_dir,
                use_multiprocessing=use_multiprocessing,
                do_legislation=do_legislation,
            )
            if do_delete_html:
                delete_html.delete_html(conversion_start, output_dir)

        # disambiguation stage
        if do_disambiguation:
            _run_timed_phase(
                logfile, "Disambiguation", disambiguation.disambiguation,
                file_list=file_list,
                dbfile_name=dbfile_name,
                output_dir=output_dir,
                use_multiprocessing=use_multiprocessing,
            )

        # index stage
        if do_index:
            _run_timed_phase(
                logfile, "Index", indexes.make_indexes,
                dbfile_name=dbfile_name,
                output_dir=output_dir,
                use_multiprocessing=use_multiprocessing,
            )
Пример #2
0
            log.error(
                "error during tokenization of file '{0}', exiting".format(
                    filename))
            continue
        tokenized = "\n".join([' '.join(sentence) for sentence in tokens[:-1]])
    else:
        tokenized = text

    log.info("Parsing")
    drs = get_all(tokenized)
    if not drs:
        log.error("error during the execution of Boxer on file '{0}', exiting".
                  format(filename))
        continue
    log.info("Word sense disambiguation and entity linking")
    synsets, entities = disambiguation(tokenized, drs)
    if synsets == None or entities == None:
        log.error(
            "error during the disambiguation of file '{0}', exiting".format(
                filename))
        continue

    # extracting co-mentions: one (subject, predicate, object) triple per
    # unordered pair of distinct linked entities
    if options.comentions:
        # The predicate is identical for every pair, so build it once.  The
        # original left '<{0}#comention>' unformatted and passed both values
        # to '<{2}>'.format(...), which raises IndexError (no index 2).
        comention_predicate = '<{0}#comention>'.format(
            config.get('namespace', 'relation'))
        dbpedia_entities = set(map(lambda x: x['entity'], entities))
        for entity1, entity2 in combinations(dbpedia_entities, 2):
            # pairs involving the 'null' placeholder entity are skipped
            if entity1 != 'null' and entity2 != 'null':
                triples.append(('<{0}>'.format(entity1),
                                comention_predicate,
                                '<{0}>'.format(entity2)))
Пример #3
0
        log.info("Parsing with Boxer")
        semantics = candc.get_all(tokenized)
        if not semantics:
            log.error("error during the execution of Boxer on file '{0}', exiting".format(filename))
            continue

    elif config.get('semantics', 'module') == 'semafor':
        log.info("Parsing with Semafor")
        semantics, tokenized = semafor.parse(text)

        if not semantics:
            log.error("error during the execution of Semafor on file '{0}', exiting".format(filename))
            continue

    log.info("Word sense disambiguation and entity linking")
    synsets, entities = disambiguation(tokenized, semantics)
    if synsets==None or entities==None:
        log.error("error during the disambiguation of file '{0}', exiting".format(filename))
        continue

    # extracting co-mentions: one (subject, predicate, object) triple per
    # unordered pair of distinct linked entities
    if options.comentions:
        # The predicate is identical for every pair, so build it once.  The
        # original left '<{0}#comention>' unformatted and passed both values
        # to '<{2}>'.format(...), which raises IndexError (no index 2).
        comention_predicate = '<{0}#comention>'.format(
            config.get('namespace', 'relation'))
        dbpedia_entities = set(map(lambda x: x['entity'], entities))
        for entity1, entity2 in combinations(dbpedia_entities, 2):
            # pairs involving the 'null' placeholder entity are skipped
            if entity1 != 'null' and entity2 != 'null':
                triples.append(('<{0}>'.format(entity1),
                                comention_predicate,
                                '<{0}>'.format(entity2)))

    # build dictionary of variables
    try:
        variables = dict()
Пример #4
0
        tokens = tokenize(text)
        if not tokens:
            log.error("error during tokenization of file '{0}', exiting".format(filename))
            continue
        tokenized = "\n".join([' '.join(sentence) for sentence in tokens[:-1]])
    else:
        tokenized = text


    log.info("Parsing")
    drs = get_all(tokenized)
    if not drs:
        log.error("error during the execution of Boxer on file '{0}', exiting".format(filename))
        continue
    log.info("Word sense disambiguation and entity linking")
    synsets, entities = disambiguation(tokenized, drs)
    if synsets==None or entities==None:
		log.error("error during the disambiguation of file '{0}', exiting".format(filename))
		continue

    # extracting co-mentions: one (subject, predicate, object) triple per
    # unordered pair of distinct linked entities
    if options.comentions:
        # The predicate is identical for every pair, so build it once.  The
        # original left '<{0}#comention>' unformatted and passed both values
        # to '<{2}>'.format(...), which raises IndexError (no index 2).
        comention_predicate = '<{0}#comention>'.format(
            config.get('namespace', 'relation'))
        dbpedia_entities = set(map(lambda x: x['entity'], entities))
        for entity1, entity2 in combinations(dbpedia_entities, 2):
            # pairs involving the 'null' placeholder entity are skipped
            if entity1 != 'null' and entity2 != 'null':
                triples.append(('<{0}>'.format(entity1),
                                comention_predicate,
                                '<{0}>'.format(entity2)))

    # build dictionary of variables
    try:
        variables = dict()