Example #1
0
def get_docs_stats():
    logging.root.setLevel(logging.ERROR)

    # stats accumulated before and after quoted speech is stripped from each document
    acumm_stats_before = [0 for _ in voz.Document.get_stats_labels()]
    acumm_stats_after = [0 for _ in voz.Document.get_stats_labels()]
    acumm_count = 0
    logging.basicConfig(level=logging.WARNING)
    file_path = settings.STY_FILE_PATH
    for sty_file in settings.STY_FILES:
        acumm_count +=1
        logger.info("Processing %s" % sty_file)
        doc = styhelper.create_document_from_sty_file(file_path+sty_file)
        doc_stats = doc.get_stats()
        for i in xrange(len(acumm_stats_before)):
            acumm_stats_before[i]+=doc_stats[i]
        quoted_speech_file = sty_file.split()[0] + "/sentences.tsv.csv"
        quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
        quotedspeechhelper.clean_quoted_speech_from_document(doc)
        doc_stats = doc.get_stats()
        for i in xrange(len(acumm_stats_before)):
            acumm_stats_after[i]+=doc_stats[i]
        #break

    print "Counts"
    #print voz.Document.format_stats(acumm_stats_before)
    print voz.Document.format_stats(acumm_stats_after)
    print "Averages"
    for i in xrange(len(acumm_stats_before)):
        #acumm_stats_before[i]=1.0*acumm_stats_before[i]/acumm_count
        acumm_stats_after[i]=1.0*acumm_stats_after[i]/acumm_count
    #print voz.Document.format_stats(acumm_stats_before)
    print voz.Document.format_stats(acumm_stats_after)
Example #2
0
def generate_filtered_text_files():
    """
    Generate files to be processed by parsers.
    ClearNLP:
     source /Users/josepvalls/soft/clearnlp/setup_classpath.sh
     java -Xmx5g -XX:+UseConcMarkSweepGC edu.emory.clir.clearnlp.bin.NLPDecode -mode ner -c config_decode_ner.xml -i /Users/josepvalls/voz2/stories/dialog_filtered -ie txt
    ClearNLP Coref
     java /Users/josepvalls/Dropbox/projects/clearnlp/src/main/java/edu/drexel/valls
    Stanford CoreNLP
     use CoreNLP server cache
    Open NLP
     sh /Users/josepvalls/Dropbox/projects/coref-opennlp/Coref/run.cmd
    """
    logging.basicConfig(level=logging.DEBUG)
    file_path = settings.STY_FILE_PATH
    for sty_file in settings.STY_FILES:
        logger.info("Processing %s" % sty_file)
        quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
        doc = styhelper.create_document_from_sty_file(file_path + sty_file)
        quotedspeechhelper.annotate_sentences(doc,
                                              file_path + quoted_speech_file)
        sentences = []
        for sentence in doc.sentences:
            if sentence.is_normal():
                sentences.append(sentence.get_text() + '\n')
        file_name = settings.STORY_TXT_PATH + str(doc.id) + '.txt'
        logger.info("Writing %d sentences to %s" % (len(sentences), file_name))
        with open(file_name, 'w') as f:
            f.writelines(sentences)
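
# A minimal sketch (not part of the original code) of driving the ClearNLP NER command
# from the docstring above via subprocess; it assumes the ClearNLP classpath has already
# been set up (setup_classpath.sh) and reuses the input directory named in the docstring.
# The function name run_clearnlp_ner is hypothetical.
def run_clearnlp_ner(input_dir='/Users/josepvalls/voz2/stories/dialog_filtered'):
    import subprocess
    cmd = ['java', '-Xmx5g', '-XX:+UseConcMarkSweepGC',
           'edu.emory.clir.clearnlp.bin.NLPDecode',
           '-mode', 'ner',
           '-c', 'config_decode_ner.xml',
           '-i', input_dir,
           '-ie', 'txt']
    # raises CalledProcessError if the decoder exits with a non-zero status
    subprocess.check_call(cmd)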
Example #3
0
def get_docs_stats(feature_group,feature_distribution):
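    # Builds one voz.Document per story (optionally stripping dialog), computes
    # narrative-function features, and prints a per-story summary; the TSV/ARFF
    # output further down is currently short-circuited by a `continue`.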
    tsv = None
    arff = None
    idxlst = ''
    #logging.root.setLevel(logging.ERROR)
    file_path = settings.STY_FILE_PATH
    documents = []
    #for sty_file in []:
    for sty_file in settings.STY_FILES[14:]:
    #for sty_file in settings.STY_FILES:
    #for sty_file in ['03 - Bukhtan Bukhtanovich.sty']:
        try:
            0/0  # deliberately raise ZeroDivisionError so the cached JSON is never loaded
            doc = voz.create_document_from_jsonpickle_file('/Users/josepvalls/temp/voz2/'+sty_file+'.json')
            logger.info("Loading JSON %s" % sty_file)
        except Exception:
            logger.info("Processing %s" % sty_file)
            #quoted_speech_file = sty_file.split()[0] + "/sentences.csv"  # superseded by the combined TSV below
            quoted_speech_file = "all_sentences.tsv"
            doc = styhelper.create_document_from_sty_file(file_path+sty_file)
            if DO_REMOVE_DIALOG:
                quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file, format='tsv', single_sentences_file_story_id=doc.id)
                quotedspeechhelper.clean_quoted_speech_from_document(doc)
            doc.serialize_to_file(TEMP_CACHE_PATH+sty_file+'.json',use_deep_copy=True)
        # print util.string_as_print(doc.id,doc.properties.get('afanasev_new',doc.id),doc.properties.get('afanasev_old',doc.id), doc.narrative.format_summary())
        documents.append(doc)
    if False and not DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT:  # disabled: never load the old auto-annotated documents
        for document_id in [1001,1002,1003,1004,2001]:
        #for document_id in [1004]:
            documents.append(oldannotationhelper.load_old_annotations_into_document(document_id))
    for doc in documents:
        import narrativehelper
        narrativehelper.VERB_FEATURES = feature_group
        narrativehelper.DO_COMPUTE_ROLE_DISTRIBUTION = feature_distribution
        narrativehelper.DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT = DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT
        assert isinstance(doc,voz.Document)
        doc.narrative.filter_non_actual_default = DO_FILTER_NONACTUAL
        doc.narrative.compute_features()
        print doc.id,"Narrative: ",doc.narrative.format(options={'one_liner':True,'use_function_group':True})
        continue  # short-circuit: the token counting, file output, and printing below are skipped
        print sum([i.tokens_count for i in doc.narrative.function_list])
        if DO_WRITE_FILES:
            for _ in doc.narrative.functions():
                idxlst += "%d\n" % doc.id
            if not tsv:
                tsv = doc.narrative.format_tsv()
                arff = doc.narrative.format_arff()
            else:
                tsv += doc.narrative.format_tsv(False)
                arff += doc.narrative.format_arff(False)
        if DO_PRINT_TO_SCREEN:
            #print doc.id
            for function in doc.narrative.functions():
                print doc.id,function.get_feature_vector()
    if DO_WRITE_FILES:
        open('tool_corpus_functions_summary/story_indices%s%s.txt' % (
            ('_filtered' if DO_FILTER_NONACTUAL else ''),
            ('_nodiag' if DO_REMOVE_DIALOG else '')), 'w').write(idxlst)
        open('tool_corpus_functions_summary/tool_corpus_functions_summary_%d_%s%s%s%s.tsv' % (
            feature_group, 'dist' if feature_distribution else 'abs',
            '_filtered' if DO_FILTER_NONACTUAL else '',
            '_auto' if DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT else '',
            '_nodiag' if DO_REMOVE_DIALOG else ''), 'w').write(tsv)
        open('tool_corpus_functions_summary/tool_corpus_functions_summary_%d_%s%s%s%s.arff' % (
            feature_group, 'dist' if feature_distribution else 'abs',
            '_filtered' if DO_FILTER_NONACTUAL else '',
            '_auto' if DO_USE_OLD_AUTO_DATA_INSTEAD_OF_STY_GT else '',
            '_nodiag' if DO_REMOVE_DIALOG else ''), 'w').write(arff)
Example #4
0
def get_stats_docs_verbs():
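    # Prints the total number of verbs across all stories and the total number of
    # independent subject/object mentions attached to those verbs.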
    docs = []
    for sty_file in settings.STY_FILES:
        doc = styhelper.create_document_from_sty_file(settings.STY_FILE_PATH + sty_file)
        quotedspeechhelper.annotate_sentences(doc, settings.STORY_ALL_SENTENCES, format='tsv',
                                              single_sentences_file_story_id=doc.id)
        docs.append(doc)
    print sum([len(i.get_all_verbs()) for i in docs])
    print sum([sum([len([k for k in j._objects if k and k.is_independent]) + len(
        [k for k in j._subjects if k and k.is_independent]) for j in i.get_all_verbs()]) for i in docs])
Example #5
0
def generate_filtered_entity_file():
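    # Collects the lowercased mention strings from the non-quoted sentences of the
    # selected story and writes them to a text file, one per line.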
    logging.basicConfig(level=logging.DEBUG)
    file_path = settings.STY_FILE_PATH
    mentions = []
    for sty_file in settings.STY_FILES[2:3]:
        logger.info("Processing %s" % sty_file)
        quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
        doc = styhelper.create_document_from_sty_file(file_path + sty_file)
        quotedspeechhelper.annotate_sentences(doc,
                                              file_path + quoted_speech_file)
        for sentence in doc.sentences:
            assert (isinstance(sentence, voz.Sentence))
            if sentence.annotations.is_normal():
                for mention in sentence.mentions:
                    mentions.append(mention.get_text().lower() + '\n')
    file_name = '/Users/josepvalls/voz2/stories/finlayson-entities.txt'
    logger.info("Writing %d mentions to %s" % (len(mentions), file_name))
    with open(file_name, 'w') as f:
        f.writelines(mentions)
Example #6
0
def main_print_stats():
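    # Tallies corpus-wide counts: quotes, sentences, verbs, mentions, pronoun and
    # proper-noun mentions, text characters, and characters inside quotes.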
    len_quotes = 0
    len_sentences = 0
    len_verbs = 0
    len_mentions = 0
    len_pp = 0
    len_pn = 0
    len_tokens = 0
    len_tokens_in_quotes = 0
    for story_file in settings.STY_FILES:
        print story_file
        doc = styhelper.create_document_from_sty_file(settings.STY_FILE_PATH +
                                                      story_file)
        #styhelper.fix_sty_annotations(doc)
        quotedspeechhelper.annotate_sentences(
            doc,
            settings.STORY_ALL_SENTENCES,
            single_sentences_file_story_id=doc.id)
        output_tuple = tokenize_document(doc)
        output, quotes, mentions, verbs = output_tuple
        print tokenized_string_to_string(output, 1)
        len_quotes += len(quotes)
        len_verbs += len(verbs)
        len_mentions += len(mentions)
        len_sentences += len(doc.sentences)
        len_pp += len(
            [i for i in mentions if [j for j in i.tokens if j.pos == 'PRP']])
        len_pn += len(
            [i for i in mentions if [j for j in i.tokens if j.pos == 'NNP']])
        len_tokens += len(doc.get_text())
        len_tokens_in_quotes += sum([q.offset_end - q.offset for q in quotes])
    print 'TOTAL NUM QUOTES\t', len_quotes
    print 'TOTAL NUM SENT\t', len_sentences
    print 'TOTAL NUM VERBS\t', len_verbs
    print 'TOTAL NUM MENT\t', len_mentions
    print 'TOTAL NUM PP\t', len_pp
    print 'TOTAL NUM PN\t', len_pn
    print 'TOTAL NUM chars\t', len_tokens
    print 'TOTAL NUM chars in quotes\t', len_tokens_in_quotes
Example #7
0
def create_document_using_stanford_from_filtered_sty_file(sty_file):
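    # Re-parses the non-quoted sentences of an annotated .sty story with the raw-text
    # pipeline, projects the reference coreference, entity-type, and role annotations
    # onto the new document's mentions, and dumps TSV files for fixing missing or
    # ambiguous alignments.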
    import styhelper, quotedspeechhelper, entitymanager
    stats_not_found = 0
    stats_ambiguous = 0
    stats_match_ok = 0
    logger.info("Processing %s" % sty_file)
    doc = styhelper.create_document_from_sty_file(sty_file)
    quotedspeechhelper.annotate_sentences(
        doc,
        settings.STORY_ALL_SENTENCES,
        format='tsv',
        single_sentences_file_story_id=doc.id)
    text = "\n".join([
        sentence.get_text() for sentence in doc.sentences
        if sentence.annotations.is_normal()
    ])
    doc_new = create_document_from_raw_text(text, {'story_id': doc.id + 1000})
    assert len([
        sentence for sentence in doc.sentences
        if sentence.annotations.is_normal()
    ]) == len(
        doc_new.sentences
    ), "Sentence length mismatch between annotated and processed document"
    fixed_annotation_file = settings.STORY_ANNOTATION_FIXES + '%d.tsv' % doc_new.id
    if not os.path.isfile(fixed_annotation_file):
        # Dump data for fixing
        f_fixes = open(fixed_annotation_file, 'w')
        for sentence in [
                sentence for sentence in doc.sentences
                if sentence.annotations.is_normal()
        ]:
            mentions_check = [
                i for i in sentence.mentions
                if len([j for j in i.tokens if j.pos != 'DT']) > 1
            ]
            mentions_check = sorted(mentions_check,
                                    key=lambda i:
                                    (len(i.child_mentions) * 100 - i.id),
                                    reverse=True)
            while mentions_check:
                mention = mentions_check.pop(0)
                assert isinstance(mention, entitymanager.Mention)
                f_data = mention.get_text() + "\t" + str(
                    mention.get_taxonomy(
                        entitymanager.TaxonomyContainer.TAXONOMY_ENTITY_TYPES)
                ) + ' ' + str(
                    mention.get_taxonomy(
                        entitymanager.TaxonomyContainer.
                        TAXONOMY_CHARACTER_6ROLES)) + ' ' + str(
                            mention.get_coref_group_id())
                f_data = "\t%d\t%d\t%s\n" % (doc_new.id, mention.id, f_data)
                f_fixes.write(f_data)
                for mention_ in mention.child_mentions:
                    f_data = mention_.get_text() + "\t" + str(
                        mention_.get_taxonomy(
                            entitymanager.TaxonomyContainer.
                            TAXONOMY_ENTITY_TYPES)) + ' ' + str(
                                mention_.get_taxonomy(
                                    entitymanager.TaxonomyContainer.
                                    TAXONOMY_CHARACTER_6ROLES)) + str(
                                        mention.get_coref_group_id())
                    f_data = "\t%d\t%d\t - %s\n" % (doc_new.id, mention_.id,
                                                    f_data)
                    f_fixes.write(f_data)
                    try:
                        mentions_check.remove(mention_)
                    except ValueError:  # the child mention was not in the pending list
                        pass
        f_fixes.close()

    # Annotate
    fixed_annotation_file_extra = settings.STORY_ANNOTATION_FIXES + '%d-extra.tsv' % doc_new.id
    if not os.path.isfile(fixed_annotation_file_extra):
        f_fixes = open(fixed_annotation_file_extra, 'w')
    else:
        f_fixes = None

    for sentence_ref, sentence in zip([
            sentence
            for sentence in doc.sentences if sentence.annotations.is_normal()
    ], doc_new.sentences):
        assert isinstance(sentence, voz.Sentence)
        for mention in sentence.mentions:
            if not mention.is_independent: continue
            assert isinstance(mention, entitymanager.Mention)
            tokens_ref = [sentence_ref.tokens[i.idx] for i in mention.tokens]
            mentions_ref = set(
                filter(None, [
                    sentence_ref._parent_document.get_mention_by_token_id(i.id)
                    for i in tokens_ref
                ]))
            if not mentions_ref:
                logger.warning("UNABLE TO FIND ANNOTATION FOR MENTION %s" %
                               mention.get_text())
                if f_fixes:
                    f_fixes.write(
                        "%d\tMISS\t%s\t%s\n" %
                        (mention.id, mention.get_text(), str(mention)))
                stats_not_found += 1
                continue
            elif len(mentions_ref) != 1:
                logger.warning("AMBIGUOUS ANNOTATION FOR MENTION")
                stats_ambiguous += 1
                mentions_ref = sorted(mentions_ref,
                                      key=lambda i: len(i.tokens))
                # default to the shortest candidate, preferring one that carries a character-role annotation
                mention_ref = mentions_ref[0]
                for i in mentions_ref:
                    if i.get_taxonomy(
                            entitymanager.TaxonomyContainer.
                            TAXONOMY_CHARACTER_6ROLES):
                        mention_ref = i
                        break
                if f_fixes:
                    f_fixes.write("%d\tAMBG\t%s\t%s\t%s\n" %
                                  (mention.id, mention.get_text(),
                                   [str(i)
                                    for i in mentions_ref], mention_ref))
            else:
                mention_ref = mentions_ref.pop()
                stats_match_ok += 1

            if len(
                    mention_ref.get_taxonomy(entitymanager.TaxonomyContainer.
                                             TAXONOMY_ENTITY_TYPES)) > 1:
                logger.info(
                    util.string_as_print(
                        "POTENTIALLY IGNORE", mention_ref,
                        mention_ref.get_taxonomy(
                            entitymanager.TaxonomyContainer.
                            TAXONOMY_ENTITY_TYPES)))
                mention.annotations.split_ignore = True
            mention.annotations.coref = mention_ref.get_coref_group_id()
            mention.annotations.type = \
                (mention_ref.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_ENTITY_TYPES) or ['NA'])[0]
            mention.annotations.role = \
                (mention_ref.get_taxonomy(entitymanager.TaxonomyContainer.TAXONOMY_CHARACTER_6ROLES) or ['NA'])[0]
        sentence.annotations.verbs = sentence_ref.verbs
    if f_fixes:
        f_fixes.close()

    #print stats_not_found, stats_ambiguous, stats_match_ok
    return doc_new
def get_verbs():
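    # Intended to collect verbs and their FrameNet frame mappings per narrative function;
    # note that the early `continue` and the `sys.exit()` below short-circuit most of it.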
    logging.root.setLevel(logging.ERROR)
    file_path = settings.STY_FILE_PATH
    verbs = []
    frames = []
    functions = collections.defaultdict(list)

    import verbmanager
    mapper = verbmanager.VerbMapper(verbmanager.VerbMapper.MODE_FRAMENET_TEXT)

    for sty_file in settings.STY_FILES:
        try:
            0 / 0  # deliberately raise ZeroDivisionError so the cached JSON is never loaded
            doc = voz.create_document_from_jsonpickle_file(
                '/Users/josepvalls/temp/voz2/' + sty_file + '.json')
            logger.info("Loading JSON %s" % sty_file)
        except Exception:
            logger.info("Processing %s" % sty_file)
            quoted_speech_file = sty_file.split()[0] + "/sentences.csv"
            doc = styhelper.create_document_from_sty_file(file_path + sty_file)
            assert isinstance(doc, voz.Document)
            if DO_REMOVE_DIALOG:
                quotedspeechhelper.annotate_sentences(
                    doc, file_path + quoted_speech_file)
                quotedspeechhelper.clean_quoted_speech_from_document(doc)
            doc.serialize_to_file('/Users/josepvalls/temp/voz2/' + sty_file +
                                  '.json',
                                  use_deep_copy=True)
        #print len(doc.get_all_tokens())
        logger.info(
            util.string_as_print(doc.id,
                                 doc.properties.get('afanasev_new', doc.id),
                                 doc.properties.get('afanasev_old', doc.id),
                                 doc.narrative.format_summary()))
        assert isinstance(doc, voz.Document)
        doc.narrative.compute_features()
        print sum([
            f.tokens_count
            for f in doc.narrative.functions(filter_non_actual=False)
        ])
        continue  # short-circuit: the verb/frame collection below is skipped

        for f in doc.narrative.functions():
            assert isinstance(f, voz.narrativehelper.NarrativeFunction)

            #functions[f.function_group].extend([i.token.lemma for i in f._verbs])
            functions[f.function_group].extend([
                mapper.map(i.token.lemma, fallback=False)
                for i in doc.get_all_verbs()
            ])
        verbs.extend([i.token.text for i in doc.get_all_verbs()])
        #frames.update([i.frame for i in doc.get_all_verbs()])
        #frames.extend(filter(None,[mapper.map(i.token.lemma,fallback=False) for i in doc.get_all_verbs()]))
        frames.extend([
            mapper.map(i.token.lemma, fallback=False)
            for i in doc.get_all_verbs()
        ])

        #break
    sys.exit()  # everything below (aggregation, printing, cache save) is currently unreachable
    roots = util.flatten(
        util.flatten([[i.root_hypernyms() for i in wn.synsets(verb, 'v')]
                      for verb in verbs]))
    print len(verbs), len(set(verbs))
    print len(frames), len(set(frames))
    print len(roots)
    print collections.Counter(roots).most_common()
    print collections.Counter(frames).most_common()
    print collections.Counter(verbs).most_common()
    pprint.pprint(functions)
    vozbase.serialize_to_file([verbs, frames, functions],
                              '/Users/josepvalls/temp/voz2/verbs.json', False,
                              False)
    mapper.save_cache()
def make_coreferences():
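    # Evaluates several coreference matrices per story (precision/recall/F1 and related
    # counts), aggregates them, and prints cumulative results over the corpus.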
    documents = []

    file_path = settings.STY_FILE_PATH

    vars_names = ['p1','r1','f1','p0','r0','f0','length','length**2','count1','count0','mentions_characters','char_uniq','coref_groups','c/gr','gr/c','eval']
    num_vars = len(vars_names)

    matrices_to_compute = ['OLD_STANFORD_COREF','OLD_NAME_COREF']+['OLD_RESTRICTION','OLD_TYPE']+['OLD_IDX']
    matrices_to_merge = ['OLD_STANFORD_COREF','OLD_NAME_COREF']+['OLD_RESTRICTION','OLD_TYPE']
    # OLD_ROLE_PRED1
    # OLD_ROLE_GT
    #matrices_to_print = matrices_to_compute+["AGGREGATED"]
    matrices_to_print = ["AGGREGATED"]
    cumulative = dict([(i,[0.0]*num_vars) for i in matrices_to_compute+["AGGREGATED"]])

    for sty_file in settings.STY_FILES:
        logger.info("Processing %s" % sty_file)
        quoted_speech_file = sty_file.split()[0]+"/sentences.csv"
        doc = styhelper.create_document_from_sty_file(file_path+sty_file)
        assert isinstance(doc,voz.Document)
        quotedspeechhelper.annotate_sentences(doc, file_path + quoted_speech_file)
        quotedspeechhelper.clean_quoted_speech_from_document(doc)
        doc.coreference_aux[voz.entitymanager.TaggableContainer.TAG_CHARACTER_SYMBOL] = doc.coreference
        mentions = doc.get_all_mentions()
        mentions = [i for i in mentions if i.is_independent]
        mentions = [i for i in mentions if 'CH' in i.get_taxonomy(voz.entitymanager.TaxonomyContainer.TAXONOMY_NONCHARACTER)]
        '''# create stanford, name, roles coref
        for coref_key in matrices_to_eval:
            coref_ids =  sorted([i for i in set(util.flatten([mention.get_tag(coref_key) for mention in mentions]))])
            print "mentions, coref_ids",len(mentions),len(coref_ids),coref_ids
            doc.coreference_aux[coref_key] = voz.entitymanager.Coreference(doc)
            for coref_id in coref_ids:
                mentions_coref = [i for i in mentions if coref_id in i.get_tag(coref_key)]
                doc.coreference_aux[coref_key].create_coref_group_and_entity_from_mentions(doc.get_symbol_id(coref_id,'COREF_SYMBOL'),coref_id,mentions_coref)

        # eval coref
        print voz.Document.format_stats(doc.get_stats())'''

        # eval the individual matrices and compute their table for aggregation later
        tables_to_merge = []
        table_gt_temp = None
        for coref_key in matrices_to_compute:
            print coref_key
            table,individual = voz.entitymanager.Coreference.eval_prf(coref_key,mentions)
            if table_gt_temp is None:
                table_gt_temp = table
            if coref_key in matrices_to_merge:
                tables_to_merge.append((table,individual))
            for i in xrange(num_vars):
                cumulative[coref_key][i]+=individual[i]
        # aggregate the tables and evaluate aggregation
        coref_key = "AGGREGATED"
        merge_matrices(mentions,tables_to_merge,table_gt_temp)
        table,individual = voz.entitymanager.Coreference.eval_prf(coref_key,mentions)
        for i in xrange(num_vars):
            cumulative[coref_key][i]+=individual[i]

        #break # sty_file

    for j in matrices_to_print:
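        # normalize each accumulated variable by the accumulated 'length**2' value (index 7 in vars_names)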
        for i in xrange(num_vars):
            cumulative[j][i]=cumulative[j][i]/cumulative[j][7]
    print 'CUMULATIVE OVER STORIES'
    for j in matrices_to_print:
        print j
        for i in xrange(num_vars):
            print "%s\t%f" % (vars_names[i],cumulative[j][i])
        for i in xrange(num_vars-3,num_vars):
            print "%s\t%f" % (vars_names[i],cumulative[j][i]/15.0)
        avg = 1.0 * (cumulative[j][2] * cumulative[j][8] + cumulative[j][3] * cumulative[j][9]) / (
            cumulative[j][8] + cumulative[j][9])
        print "error\t%f\t%f" % (avg,1-avg)