Example #1
def read(serifxml_dir):
    serifxml_files = []
    for filename in os.listdir(serifxml_dir):
        docid = filename
        if filename.endswith(".serifxml"):
            docid = filename[0:-(len(".serifxml"))]
        elif filename.endswith(".xml"):
            docid = filename[0:-(len(".xml"))]
        else:
            docid = None
        if docid is None:
            print("Skipping file: " + filename)
            continue
        serifxml_files.append((
            docid,
            os.path.join(serifxml_dir, filename),
        ))

    for docid, serifxml_file in serifxml_files:
        print("==== " + serifxml_file)

        serif_doc = serifxml.Document(serifxml_file)

        for s in serif_doc.sentences:
            st = s.sentence_theories[0]

            for serif_em in st.event_mention_set:
                event_type = serif_em.event_type
                anchor_text = serif_em.anchor_node.text
                confidence = serif_em.score

                print(event_type)
                print(sanitize(anchor_text))
                print(sanitize(get_snippet(serif_doc, st)))
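
A minimal usage sketch for the function above; the directory path is a placeholder, and the helpers sanitize and get_snippet are assumed to be defined elsewhere in the original module:

# Hypothetical invocation; replace the path with a real SerifXML directory.
if __name__ == "__main__":
    read("/path/to/serifxml_dir")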
Example #2
def read(serifxml_dir):
    print "Reading from: " + serifxml_dir
    serifxml_files = []
    for filename in os.listdir(serifxml_dir):
        docid = filename
        if filename.endswith(".serifxml"):
            docid = filename[0:-(len(".serifxml"))]
        elif filename.endswith(".xml"):
            docid = filename[0:-(len(".xml"))]
        else:
            docid = None
        if docid is None:
            print "Skipping file: " + filename
            continue
        serifxml_files.append((docid, os.path.join(serifxml_dir, filename),))

    for docid, serifxml_file in serifxml_files:
        print ("Reading " + serifxml_file)

        serif_doc = serifxml.Document(serifxml_file)

        for s in serif_doc.sentences:
            st = s.sentence_theories[0]

            for serif_em in st.event_mention_set:
                event_type = serif_em.event_type
                anchor_text = serif_em.anchor_node.text
                confidence = serif_em.score

                #if "." not in event_type:
                print "em\t" + event_type + "\t" + anchor_text + "\t" + str(confidence)
Example #3
def to_lingo_doc(filepath):
    """Takes in a filepath to a SerifXML, and use its sentences, tokens, entity-mentions, value-mentions
    to construct a nlplingo.text.text_theory.Document
    Returns: nlplingo.text.text_theory.Document
    """
    serif_doc = serifxml.Document(filepath)
    """:type: serifxml.Document"""

    docid = serif_doc.docid
    lingo_doc = lingoDoc(docid)
    for st_index, sentence in enumerate(serif_doc.sentences):
        st = sentence.sentence_theories[0]
        """:type: serifxml.SentenceTheory"""
        if len(st.token_sequence) == 0:
            continue
        st_text, st_start, st_end = get_snippet(serif_doc, st)

        tokens = to_tokens(st)
        assert st_start == tokens[0].start_char_offset()
        assert (st_end+1) == tokens[-1].end_char_offset()

        s = Sentence(docid, IntPair(st_start, st_end+1), st_text, tokens, st_index)
        add_entity_mentions(st, s, lingo_doc)
        add_value_mentions(st, s, lingo_doc)
        add_names(st, lingo_doc)

        lingo_doc.add_sentence(s)
    return lingo_doc
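
A usage sketch, assuming a hypothetical file path; per the docstring, the return value is an nlplingo.text.text_theory.Document:

# Hypothetical usage; the path is a placeholder.
lingo_doc = to_lingo_doc("/path/to/document.serifxml")
# The constructor was given serif_doc.docid, so the document id should be
# recoverable from the returned object (attribute name assumed here).
print(lingo_doc.docid)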
Example #4
def extractSentences(serifXmlPath):
    serif_doc = serifxml.Document(serifXmlPath)

    doc_tokens = []
    doc_offsets = []
    for s in serif_doc.sentences:
        st = s.sentence_theories[0]
        (tokens, offsets) = get_tokenized_text(st)
        doc_tokens.append(tokens)
        doc_offsets.append(offsets)

    return (doc_tokens, doc_offsets)
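
extractSentences returns one token list and one offset list per sentence; a short usage sketch with a placeholder path:

# Hypothetical usage; iterates sentences in document order.
doc_tokens, doc_offsets = extractSentences("/path/to/document.serifxml")
for sent_tokens, sent_offsets in zip(doc_tokens, doc_offsets):
    print(len(sent_tokens), sent_tokens[:5], sent_offsets[:5])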
Example #5
def record_doc_vocab(filepath, vocab_locations):
    """
    :type vocab_locations: dict[str, defaultdict(list)]
    """
    serif_doc = serifxml.Document(filepath)
    """:type: serifxml.Document"""

    docid = serif_doc.docid
    for st_index, sentence in enumerate(serif_doc.sentences):
        #st = sentence.sentence_theories[0]
        """:type: serifxml.SentenceTheory"""
        #st_text, st_start, st_end = get_snippet(serif_doc, st)

        tokens = to_tokens(sentence)
        record_unigram_info(docid, tokens, vocab_locations, sentence)
        record_bigram_info(docid, tokens, vocab_locations, sentence)
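
A sketch of a driver that populates vocab_locations over a directory of SerifXML files; the dictionary keys and the directory path are assumptions for illustration, not taken from the original code:

from collections import defaultdict
import os

# Hypothetical setup; the keys "unigram"/"bigram" and the path are placeholders.
vocab_locations = {"unigram": defaultdict(list), "bigram": defaultdict(list)}
serifxml_dir = "/path/to/serifxml_dir"
for filename in os.listdir(serifxml_dir):
    record_doc_vocab(os.path.join(serifxml_dir, filename), vocab_locations)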
Example #6
    def extractSentences(self, serifXmlPath):
        self.sentenceFile = NamedTemporaryFile('w', delete=False)
        self.sentenceFile.close()

        o = codecs.open(self.sentenceFile.name, 'w', encoding='utf8')

        serif_doc = serifxml.Document(serifXmlPath)

        doc_tokens = []
        doc_offsets = []
        for s in serif_doc.sentences:
            st = s.sentence_theories[0]
            (tokens, offsets) = self.get_tokenized_text(st)
            doc_tokens.append(tokens)
            doc_offsets.append(offsets)
            o.write(' '.join(tokens) + "\n")

        o.close()

        #return self.sentenceFile.name
        return (self.sentenceFile.name, doc_tokens, doc_offsets)
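
Because the temporary sentence file is created with delete=False, the caller is responsible for removing it; a hedged sketch (the reader instance and path are placeholders):

# Hypothetical caller-side usage and cleanup.
sentence_path, doc_tokens, doc_offsets = reader.extractSentences("/path/to/document.serifxml")
# ... pass sentence_path to the downstream tool ...
os.unlink(sentence_path)  # not auto-deleted because of delete=False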
Example #7
files_length = len(serifxml_files)
count = 0

actor_id_to_entity_group = dict()

for docid, serifxml_file in serifxml_files:
    mention_map = dict()  # maps serif mention to KBMention
    event_map = dict()  # maps serif event mention (or icews event mention) to (KBEvent, KBEventMention, SentenceTheory)

    count += 1
    print "SerifXMLReader producing KB objects in: " + docid + " (" + str(
        count) + "/" + str(files_length) + ")"

    serif_doc = serifxml.Document(serifxml_file)

    for serif_event in serif_doc.event_set:
        event_type = serif_event.event_type

        print("=== Event: " + event_type)  # + "\ttense: " + serif_event.tense.value + " genericity: " + serif_event.genericity.value + " modality: " + serif_event.modality.value + " polarity: " + serif_event.polarity.value)

        for argument in serif_event.arguments:
            if argument.entity is not None:
                canonical_name = "NA"
                if argument.entity.canonical_name is not None:
                    canonical_name = argument.entity.canonical_name
                print("    -" + str(argument.role) + ": " +
                      argument.entity.entity_type + "\t" +
Example #8
for filename in os.listdir(input_dir):
    if filename.endswith(".serifxml"):
        docid = filename[0:-(len(".serifxml"))]
    elif filename.endswith(".sgm.xml"):
        docid = filename[0:-(len(".sgm.xml"))]
    else:
        docid = None

    if docid is None:
        continue

    doc_num += 1
    print("[cross-doc] Doc #" + str(doc_num) + ": " + docid)

    # outfile = os.path.join(output_dir, docid + ".json")
    # o = codecs.open(outfile, 'w', encoding='utf8')

    serif_doc = serifxml.Document(os.path.join(input_dir, filename))

    processor.produce_entities(docid, serif_doc)

if include_entities:

    for eid in processor.eid_to_entity:
        entity = processor.eid_to_entity[eid]
        result["all"]["entities"].append(entity)
        # If we have 3 doc types, we'll split the entity into 3,
        # each containing the mentions from a particular doc type
        # doc_type_to_entity is a dict that maps doc_type to a single entity
        doc_type_to_entity = entity.split_into_doc_types(docid_to_doc_type_map)
        for doc_type in doc_type_to_entity:
            entity = doc_type_to_entity[doc_type]
            result[doc_type]["entities"].append(entity)
Example #9
# CREATE INPUT TEXT FILE

# Text file containing sentences that we will run on
text_filename = basename
if not text_filename.endswith(".txt"):
    text_filename = text_filename + ".txt"
outfile = os.path.join(output_dir, text_filename)
o = codecs.open(outfile, 'w', encoding='utf8')

# Metadata file containing char offset mapping from text file to
# serifxml file
metadata_filename = basename + ".meta"
metadata_file = os.path.join(metadata_dir, metadata_filename)
m = open(metadata_file, 'w')

document = serifxml.Document(input_file)
current_output_offset = 0
for sentence in document.sentences:
    original_sentence_text = sentence.text
    sentence_text = original_sentence_text.replace("\r", " ")
    sentence_text = sentence_text.replace("\n", " ")
    o.write(sentence_text + "\n")
    m.write(str(current_output_offset) + " " + str(sentence.start_char) + "\n")
    current_output_offset += len(sentence.text) + 1  # +1 for newline character
o.close()
m.close()

# RUN DISCOURSE PARSER

time_limit = 1800
os.chdir(config_dir)
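
Each line of the metadata file written above pairs a sentence's character offset in the plain-text file with its start_char in the SerifXML document, so an offset in the text file can be mapped back to a SerifXML offset; a sketch of that reverse mapping (the function name and path are hypothetical):

# Hypothetical helper that maps an offset in the generated .txt file back to a
# SerifXML character offset using the .meta file written above.
def text_offset_to_serif_offset(meta_path, text_offset):
    pairs = []
    with open(meta_path) as f:
        for line in f:
            out_offset, serif_start = line.split()
            pairs.append((int(out_offset), int(serif_start)))
    # Use the last sentence whose text-file offset does not exceed text_offset.
    for out_offset, serif_start in reversed(pairs):
        if out_offset <= text_offset:
            return serif_start + (text_offset - out_offset)
    return None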