from os import listdir
from os.path import join
# get_root(path) is assumed to be defined elsewhere in this module: it parses
# a KAF file and returns its XML root element.

def get_speculative_lemmas(directory):
    """Takes all KAF-files from a directory and returns a list of all the unique lemmas that occur in speculative opinion expressions."""
    speculative_terms = []
    speculative_lemmas = []
    # CHECK WHICH TERM IDS ARE SPECULATIVE IN ALL KAF-FILES FROM DIRECTORY: CREATE SPECULATIVE_TERMS LIST
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        opinions_layer = root.find("opinions")
        if opinions_layer is None:
            print "Error: no opinions layer found in", filename
            continue
        for opinion in opinions_layer.findall("opinion"):
            for expression in opinion.findall("opinion_expression"):
                polarity = expression.get("polarity")
                if polarity == "speculation":
                    span = expression.find("span")
                    for target in span.findall("target"):
                        term = target.get("id")
                        term = filename + "_" + term
                        speculative_terms.append(term)
    # LOOK UP THE LEMMA OF EACH SPECULATIVE TERM ID USING THE TERMS-LEMMAS DICTIONARY
    dict_terms_lemmas = get_terms_lemmas(directory)
    speculative_terms = set(speculative_terms)
    for term in dict_terms_lemmas:
        if term in speculative_terms:
            speculative_lemmas.append(dict_terms_lemmas[term])
    speculative_lemmas = list(set(speculative_lemmas))
    return speculative_lemmas
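# A minimal usage sketch for the helper above: dump the speculative lemmas to
# a file for inspection. The directory name "kaf_corpus" and the output file
# name are hypothetical, not part of the original pipeline.
def demo_dump_speculative_lemmas():
    lemmas = get_speculative_lemmas("kaf_corpus")
    outfile = open("speculative_lemmas.txt", "w")
    for lemma in sorted(lemmas):
        outfile.write(lemma.encode("utf-8") + "\n")
    outfile.close()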
def get_lemmas_per_sentence(directory):
    # Takes all KAF-files in a directory as input and returns a dictionary with the sentence id as key and a list of the lemmas as value
    n = 0
    dict_sentences_lemmas = {}
    filenames = listdir(directory)
    for filename in filenames:
        n += 1
        print "Processing file " + str(n) + " out of " + str(len(filenames)) + "..."
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        text_layer = root.find("text")
        terms_layer = root.find("terms")
        for wf_element in text_layer.findall("wf"):
            sent_id = wf_element.get("sent")
            word_id = wf_element.get("wid")
            sent_id = filename + "_s" + sent_id
            # Scan the terms layer for the term whose span covers this word id
            for term in terms_layer.findall("term"):
                lemma = term.get("lemma")
                span_layer = term.find("span")
                for target in span_layer.findall("target"):
                    if target.get("id") == word_id:
                        if sent_id not in dict_sentences_lemmas:
                            dict_sentences_lemmas[sent_id] = [lemma]
                        else:
                            dict_sentences_lemmas[sent_id].append(lemma)
    return dict_sentences_lemmas
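# The function above rescans the whole terms layer once per token, which is
# quadratic per file. A minimal sketch of an equivalent but faster variant,
# assuming the same KAF layout (the function name is new, not part of the
# original pipeline): build a word-id-to-lemma map once per file, then make
# a single pass over the tokens.
def get_lemmas_per_sentence_fast(directory):
    dict_sentences_lemmas = {}
    for filename in listdir(directory):
        try:
            root = get_root(join(directory, filename))
        except Exception:
            print "Error: could not process", filename
            continue
        # Map each word id to the lemma of the term whose span covers it
        wordid_to_lemma = {}
        for term in root.find("terms").findall("term"):
            for target in term.find("span").findall("target"):
                wordid_to_lemma[target.get("id")] = term.get("lemma")
        # Single pass over the tokens, in sentence order
        for wf_element in root.find("text").findall("wf"):
            sent_id = filename + "_s" + wf_element.get("sent")
            lemma = wordid_to_lemma.get(wf_element.get("wid"))
            if lemma is not None:
                dict_sentences_lemmas.setdefault(sent_id, []).append(lemma)
    return dict_sentences_lemmas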
def get_all_tokens(directory):
    # Takes all KAF-files in a directory as input and returns a list of all the unique tokens in these files (get_words is assumed to be defined elsewhere in this module).
    all_tokens = []
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        tokens = get_words(root)
        for token in tokens:
            if token not in all_tokens:
                all_tokens.append(token)
    return all_tokens
def get_all_lemmas(directory):
    # Takes all KAF-files in a directory as input and returns a list of all the unique lemmas in these files.
    all_lemmas = []
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        term_layer = root.find("terms")
        for term in term_layer.findall("term"):
            lemma = term.get("lemma")
            if lemma not in all_lemmas:
                all_lemmas.append(lemma)
    return all_lemmas
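# A minimal usage sketch: write the collected lemmas to a file, one per line.
# The output file name "all_lemmas.txt" is hypothetical.
def demo_dump_all_lemmas(directory):
    outfile = open("all_lemmas.txt", "w")
    for lemma in get_all_lemmas(directory):
        outfile.write(lemma.encode("utf-8") + "\n")
    outfile.close()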
def get_sentence_dictionary(directory):
    # Takes all KAF-files from a directory and returns a dictionary with all the sentence ids as key and an empty list as value
    dict_sentences = {}
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        text_element = root.find("text")
        for wf_element in text_element.findall("wf"):
            sent_id = wf_element.get("sent")
            sent_id = filename + "_s" + sent_id
            if sent_id not in dict_sentences:
                dict_sentences[sent_id] = []
    return dict_sentences
def get_terms_lemmas(directory):
    # Takes all KAF-files in a directory as input and returns a dictionary with the term id as key and the lemma as value
    dict_terms_lemmas = {}
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        terms_layer = root.find("terms")
        for term in terms_layer.findall("term"):
            term_id = term.get("tid")
            term_id = filename + "_" + term_id
            term_lemma = term.get("lemma")
            dict_terms_lemmas[term_id] = term_lemma
    return dict_terms_lemmas
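# The keys above are prefixed with the file name so that term ids stay unique
# across files. A hypothetical entry (file name, term id and lemma are made up
# for illustration):
#
#   dict_terms_lemmas["review42.kaf_t17"]  ->  "believe"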
def get_wordids_words(directory):
    # Takes all KAF-files in a directory as input and returns a dictionary with the word id as key and the word itself as value
    dict_wordids_words = {}
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        text_layer = root.find("text")
        for wf_element in text_layer.findall("wf"):
            word_id = wf_element.get("wid")
            word_id = filename + "_" + word_id
            word = wf_element.text
            dict_wordids_words[word_id] = word
    return dict_wordids_words
def get_hedges_per_sentence(directory):
    """Takes all KAF-files in a directory as input and returns a dictionary with all the sentences that contain a hedge cue, with the sentence id as key and a list of the hedge cues as value."""
    hedge_cues = ["possible", "possibly", "probably", "probable", "might", "may",
                  "possibility", "probability", "presume", "suppose", "suggest",
                  "believe", "think", "if", "perhaps", "maybe", "likely", "could",
                  "speculate", "speculation", "suspect", "guess", "predict",
                  "prediction", "whether", "improbable", "seem", "question",
                  "indicate", "indication", "unsure", "allegedly", "apparently",
                  "favor", "unlikely", "doubt", "assume", "assumption", "certainty",
                  "uncertainty", "certain", "uncertain", "questionable", "ambivalent",
                  "dubious", "erratic", "hazy", "hesitant", "insecure", "unclear",
                  "undetermined", "unpredictable", "predictable", "speculative",
                  "indefinite", "indeterminate", "doubtful", "disbelieve", "potential",
                  "potentially", "imaginable", "fear", "hope", "thinkable", "promising",
                  "promise", "hopeful", "hopefully", "feasible", "feasibly", "reasonably",
                  "conceivably", "reasonable", "conceivable", "perchance", "imaginably",
                  "presumably", "seemingly", "assumable", "expect", "expectation",
                  "expectedly", "consider"]
    hedge_terms = {}
    hedge_words = {}
    hedges_in_sentences = {}
    # CHECK WHICH TERMS ARE HEDGE CUES
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        terms_layer = root.find("terms")
        for term in terms_layer.findall("term"):
            term_id = term.get("tid")
            term_id = filename + "_" + term_id
            lemma = term.get("lemma")
            if lemma in hedge_cues:
                hedge_terms[term_id] = lemma
    # CHECK WHICH WORDS ARE HEDGE CUES
    dict_terms_words = get_terms_words(directory)
    for term in dict_terms_words:
        if term in hedge_terms:
            word_id = dict_terms_words[term]
            lemma = hedge_terms[term]
            hedge_words[word_id] = lemma
    # CHECK WHICH HEDGE CUES EACH SENTENCE CONTAINS AND CREATE DICTIONARY
    dict_sentences_words = get_sentences_words(directory)
    for sent_id in dict_sentences_words:
        span_words = dict_sentences_words[sent_id]
        for word_id in span_words:
            if word_id in hedge_words:
                lemma = hedge_words[word_id]
                if sent_id not in hedges_in_sentences:
                    hedges_in_sentences[sent_id] = [lemma]
                else:
                    hedges_in_sentences[sent_id].append(lemma)
        # Sentences without any hedge cue get the string marker "No hedge cues" instead of a list
        if sent_id not in hedges_in_sentences:
            hedges_in_sentences[sent_id] = "No hedge cues"
    return hedges_in_sentences
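# A minimal usage sketch: count how many sentences contain at least one hedge
# cue. The directory name "kaf_corpus" is hypothetical.
def demo_count_hedged_sentences():
    hedges = get_hedges_per_sentence("kaf_corpus")
    hedged = [s for s in hedges if hedges[s] != "No hedge cues"]
    print "Hedged sentences:", len(hedged), "out of", len(hedges)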
def get_tokens_per_sentence(directory):
    #Takes all KAF-files in a directory as input and returns a dictionary with the sentence id as key and a list of the tokens as value 
    dict_sentences_tokens = {}
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        text_layer = root.find("text")
        for wf_element in text_layer.findall("wf"):
            sent_id = wf_element.get("sent")
            sent_id = filename + "_s" + sent_id
            token = wf_element.text
            if sent_id not in dict_sentences_tokens:
                dict_sentences_tokens[sent_id] = [token]
            else:
                dict_sentences_tokens[sent_id].append(token)
    return dict_sentences_tokens
def get_terms_words(directory):
    # Takes all KAF-files in a directory as input and returns a dictionary with the term id as key and the word id as value
    dict_terms_words = {}
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        terms_layer = root.find("terms")
        for term in terms_layer.findall("term"):
            term_id = term.get("tid")
            term_id = filename + "_" + term_id
            span = term.find("span")
            for target in span.findall("target"):
                word_id = target.get("id")
                word_id = filename + "_" + word_id
            # Note: for a multi-word term only the last target's word id is kept
            dict_terms_words[term_id] = word_id
    return dict_terms_words
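# Because get_terms_words keeps a single word id per term, multi-word terms
# lose all but their last token. A minimal sketch of a variant that keeps
# every word id, assuming the same KAF layout (the function name is new, not
# part of the original pipeline):
def get_terms_wordlists(directory):
    dict_terms_wordlists = {}
    for filename in listdir(directory):
        try:
            root = get_root(join(directory, filename))
        except Exception:
            print "Error: could not process", filename
            continue
        for term in root.find("terms").findall("term"):
            term_id = filename + "_" + term.get("tid")
            # Keep the full list of word ids covered by this term's span
            dict_terms_wordlists[term_id] = [filename + "_" + target.get("id")
                                             for target in term.find("span").findall("target")]
    return dict_terms_wordlists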
def get_sentences_words(directory):
    # Takes all KAF-files in a directory as input and returns a dictionary with the sentence id as key and a list of the word ids as value
    dict_sentences_words = {}
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        text_layer = root.find("text")
        for wf_element in text_layer.findall("wf"):
            sent_id = wf_element.get("sent")
            sent_id = filename + "_s" + sent_id
            word_id = wf_element.get("wid")
            word_id = filename + "_" + word_id
            if sent_id not in dict_sentences_words:
                dict_sentences_words[sent_id] = [word_id]
            else:
                dict_sentences_words[sent_id].append(word_id)
    return dict_sentences_words
def get_opinionated_sentences(directory):
    """Takes all KAF-files from a directory and returns a set of all the opinionated sentences in these files (sentence ids)."""
    opinionated_terms = []
    opinionated_words = []
    opinionated_sentences = []
    # CHECK WHICH TERMS ARE OPINIONATED IN ALL KAF-FILES FROM DIRECTORY: CREATE OPINIONATED_TERMS LIST
    print "Collecting opinionated terms"
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        opinions_layer = root.find("opinions")
        if opinions_layer is None:
            print "Error: no opinions layer found in", filename
            continue
        for opinion in opinions_layer.findall("opinion"):
            for expression in opinion.findall("opinion_expression"):
                span = expression.find("span")
                for target in span.findall("target"):
                    term = target.get("id")
                    term = filename + "_" + term
                    opinionated_terms.append(term)
    # CHECK WHICH WORDS ARE OPINIONATED USING THE TERMS-WORDS DICTIONARY: CREATE OPINIONATED_WORDS LIST
    print "Collecting opinionated words"
    dict_terms_words = get_terms_words(directory)
    opinionated_terms = set(opinionated_terms)
    for term in dict_terms_words:
        if term in opinionated_terms:
            opinionated_words.append(dict_terms_words[term])
    # CHECK WHICH SENTENCES ARE OPINIONATED USING THE SENTENCES-WORDS DICTIONARY: ADD TO OPINIONATED_SENTENCES SET
    print "Collecting opinionated sentences"
    dict_sentences_words = get_sentences_words(directory)
    opinionated_words = set(opinionated_words)
    for sent_id in dict_sentences_words:
        for word_id in dict_sentences_words[sent_id]:
            if word_id in opinionated_words:
                opinionated_sentences.append(sent_id)
    opinionated_sentences = set(opinionated_sentences)
    return opinionated_sentences
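# A minimal usage sketch: report what fraction of the corpus is opinionated.
# The directory name "kaf_corpus" is hypothetical.
def demo_opinionated_fraction():
    opinionated = get_opinionated_sentences("kaf_corpus")
    all_sentences = get_sentence_dictionary("kaf_corpus")
    print len(opinionated), "of", len(all_sentences), "sentences are opinionated"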
def get_speculative_tokens(directory):
    """Takes all KAF-files from a directory and returns a list of all the unique speculative tokens in these files."""
    speculative_terms = []
    speculative_wordids = []
    speculative_words = []
    # CHECK WHICH TERM IDS ARE SPECULATIVE IN ALL KAF-FILES FROM DIRECTORY: CREATE SPECULATIVE_TERMS LIST
    for filename in listdir(directory):
        path = join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        opinions_layer = root.find("opinions")
        if opinions_layer is None:
            print "Error: no opinions layer found in", filename
            continue
        for opinion in opinions_layer.findall("opinion"):
            for expression in opinion.findall("opinion_expression"):
                polarity = expression.get("polarity")
                if polarity == "speculation":
                    span = expression.find("span")
                    for target in span.findall("target"):
                        term = target.get("id")
                        term = filename + "_" + term
                        speculative_terms.append(term)
    # CHECK WHICH WORD IDS ARE SPECULATIVE USING THE TERMS-WORDS DICTIONARY: CREATE SPECULATIVE_WORDIDS LIST
    dict_terms_words = get_terms_words(directory)
    speculative_terms = set(speculative_terms)
    for term in dict_terms_words:
        if term in speculative_terms:
            speculative_wordids.append(dict_terms_words[term])
    # CHECK WHICH WORDS ARE PART OF SPECULATION USING THE WORDIDS-WORDS DICTIONARY: CREATE SPECULATIVE_WORDS LIST
    dict_wordids_words = get_wordids_words(directory)
    speculative_wordids = set(speculative_wordids)
    for wordid in dict_wordids_words:
        if wordid in speculative_wordids:
            speculative_words.append(dict_wordids_words[wordid])
    speculative_words = list(set(speculative_words))
    return speculative_words
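# A minimal usage sketch: print the speculative vocabulary next to the
# speculative lemmas. The directory name "kaf_corpus" is hypothetical.
def demo_speculative_vocabulary():
    print "Speculative tokens:", sorted(get_speculative_tokens("kaf_corpus"))
    print "Speculative lemmas:", sorted(get_speculative_lemmas("kaf_corpus"))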