from os import listdir
import os.path


def get_speculative_lemmas(directory):
    """Takes all KAF files in a directory and returns a deduplicated list of
    the lemmas that occur in speculative opinion expressions."""
    speculative_terms = []
    speculative_lemmas = []

    # Collect the term ids marked as speculative in every KAF file.
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        opinions_layer = root.find("opinions")
        if opinions_layer is None:
            print "Error: no opinion found in", filename
            continue
        for opinion in opinions_layer.findall("opinion"):
            for expression in opinion.findall("opinion_expression"):
                if expression.get("polarity") == "speculation":
                    span = expression.find("span")
                    for target in span.findall("target"):
                        speculative_terms.append(filename + "_" + target.get("id"))

    # Map the speculative term ids to lemmas via the terms-lemmas dictionary.
    speculative_terms = set(speculative_terms)  # speed up membership tests
    dict_terms_lemmas = get_terms_lemmas(directory)
    for term, lemma in dict_terms_lemmas.items():
        if term in speculative_terms:
            speculative_lemmas.append(lemma)
    return list(set(speculative_lemmas))
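# The helpers get_root() and get_words() are called throughout this module
# but defined elsewhere in the project. A minimal sketch of what they
# presumably do, assuming plain ElementTree parsing; treat both bodies as
# assumptions, not the project's actual implementation:
import xml.etree.ElementTree as ET


def get_root(path):
    # Parse a KAF file and return its root element.
    return ET.parse(path).getroot()


def get_words(root):
    # Return the token strings from the <text> layer of a parsed KAF file.
    return [wf.text for wf in root.find("text").findall("wf")]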
def get_lemmas_per_sentence(directory):
    """Takes all KAF files in a directory and returns a dictionary with the
    sentence id as key and a list of the lemmas in that sentence as value."""
    dict_sentences_lemmas = {}
    filenames = listdir(directory)
    for n, filename in enumerate(filenames, 1):
        print "Processing file " + str(n) + " out of " + str(len(filenames)) + "..."
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        # Map each word id to its sentence id first, so the terms layer is
        # walked once per file instead of once per token.
        wordid_to_sent = {}
        for wf_element in root.find("text").findall("wf"):
            sent_id = filename + "_s" + wf_element.get("sent")
            wordid_to_sent[wf_element.get("wid")] = sent_id
        for term in root.find("terms").findall("term"):
            lemma = term.get("lemma")
            for target in term.find("span").findall("target"):
                sent_id = wordid_to_sent.get(target.get("id"))
                if sent_id is not None:
                    dict_sentences_lemmas.setdefault(sent_id, []).append(lemma)
    return dict_sentences_lemmas
def get_all_tokens(directory):
    """Takes all KAF files in a directory and returns a deduplicated list of
    all tokens in these files."""
    all_tokens = []
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for token in get_words(root):
            if token not in all_tokens:
                all_tokens.append(token)
    return all_tokens
def get_all_lemmas(directory):
    """Takes all KAF files in a directory and returns a deduplicated list of
    all lemmas in these files."""
    all_lemmas = []
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for term in root.find("terms").findall("term"):
            lemma = term.get("lemma")
            if lemma not in all_lemmas:
                all_lemmas.append(lemma)
    return all_lemmas
def get_sentence_dictionary(directory):
    """Takes all KAF files from a directory and returns a dictionary with all
    the sentence ids as keys and an empty list as each value."""
    dict_sentences = {}
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for wf_element in root.find("text").findall("wf"):
            sent_id = filename + "_s" + wf_element.get("sent")
            if sent_id not in dict_sentences:
                dict_sentences[sent_id] = []
    return dict_sentences
def get_terms_lemmas(directory):
    """Takes all KAF files in a directory and returns a dictionary with the
    term id as key and the lemma as value."""
    dict_terms_lemmas = {}
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for term in root.find("terms").findall("term"):
            term_id = filename + "_" + term.get("tid")
            dict_terms_lemmas[term_id] = term.get("lemma")
    return dict_terms_lemmas
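# Usage sketch for the id convention used throughout this module: term and
# word ids are prefixed with the source filename, sentence ids with
# filename + "_s". The filename and ids below are hypothetical, assuming
# KAF input containing <term tid="t3" lemma="suggest">:
#
#     dict_terms_lemmas = get_terms_lemmas("corpus")
#     print dict_terms_lemmas["doc1.kaf_t3"]   # -> "suggest"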
def get_wordids_words(directory):
    """Takes all KAF files in a directory and returns a dictionary with the
    word id as key and the word itself as value."""
    dict_wordids_words = {}
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for wf_element in root.find("text").findall("wf"):
            word_id = filename + "_" + wf_element.get("wid")
            dict_wordids_words[word_id] = wf_element.text
    return dict_wordids_words
def get_hedges_per_sentence(directory):
    """Takes all KAF files in a directory and returns a dictionary with the
    sentence id as key and a list of the hedge cues in that sentence as
    value, or the string "No hedge cues" for sentences without any."""
    hedge_cues = set([
        "possible", "possibly", "probably", "probable", "might", "may",
        "possibility", "probability", "presume", "suppose", "suggest",
        "believe", "think", "if", "perhaps", "maybe", "likely", "could",
        "speculate", "speculation", "suspect", "guess", "predict",
        "prediction", "whether", "improbable", "seem", "question",
        "indicate", "indication", "unsure", "allegedly", "apparently",
        "favor", "unlikely", "doubt", "assume", "assumption", "certainty",
        "uncertainty", "certain", "uncertain", "questionable", "ambivalent",
        "dubious", "erratic", "hazy", "hesitant", "insecure", "unclear",
        "undetermined", "unpredictable", "predictable", "speculative",
        "indefinite", "indeterminate", "doubtful", "disbelieve", "potential",
        "potentially", "imaginable", "fear", "hope", "thinkable",
        "promising", "promise", "hopeful", "hopefully", "feasible",
        "feasibly", "reasonably", "conceivably", "reasonable", "conceivable",
        "perchance", "imaginably", "presumably", "seemingly", "assumable",
        "expect", "expectation", "expectedly", "consider"])
    hedge_terms = {}
    hedge_words = {}
    hedges_in_sentences = {}

    # Collect the term ids whose lemma is a hedge cue.
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for term in root.find("terms").findall("term"):
            term_id = filename + "_" + term.get("tid")
            lemma = term.get("lemma")
            if lemma in hedge_cues:
                hedge_terms[term_id] = lemma

    # Map hedge term ids to word ids via the terms-words dictionary.
    dict_terms_words = get_terms_words(directory)
    for term in dict_terms_words:
        if term in hedge_terms:
            hedge_words[dict_terms_words[term]] = hedge_terms[term]

    # Record per sentence which hedge cues it contains.
    dict_sentences_words = get_sentences_words(directory)
    for sent_id, span_words in dict_sentences_words.items():
        for word_id in span_words:
            if word_id in hedge_words:
                hedges_in_sentences.setdefault(sent_id, []).append(hedge_words[word_id])
        if sent_id not in hedges_in_sentences:
            hedges_in_sentences[sent_id] = "No hedge cues"
    return hedges_in_sentences
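# Usage sketch, assuming a hypothetical corpus directory "corpus" and a
# hypothetical sentence id "doc1.kaf_s5":
#
#     hedges = get_hedges_per_sentence("corpus")
#     print hedges.get("doc1.kaf_s5")   # e.g. ["might", "suggest"]
#
# Sentences without any cue map to the string "No hedge cues", so callers
# should test for that sentinel before iterating over the value.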
def get_tokens_per_sentence(directory):
    """Takes all KAF files in a directory and returns a dictionary with the
    sentence id as key and a list of the tokens as value."""
    dict_sentences_tokens = {}
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for wf_element in root.find("text").findall("wf"):
            sent_id = filename + "_s" + wf_element.get("sent")
            dict_sentences_tokens.setdefault(sent_id, []).append(wf_element.text)
    return dict_sentences_tokens
def get_terms_words(directory):
    """Takes all KAF files in a directory and returns a dictionary with the
    term id as key and the word id as value. Note that a multi-word term
    keeps only the id of its last target word, since each assignment
    overwrites the previous one."""
    dict_terms_words = {}
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for term in root.find("terms").findall("term"):
            term_id = filename + "_" + term.get("tid")
            for span in term.findall("span"):
                for target in span.findall("target"):
                    dict_terms_words[term_id] = filename + "_" + target.get("id")
    return dict_terms_words
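# The parsers above assume the usual KAF layout: a <text> layer of <wf>
# tokens and a <terms> layer whose <term> elements point back at tokens
# through <span>/<target> ids. A minimal illustrative fragment with
# hypothetical ids and values:
#
#     <text>
#         <wf wid="w1" sent="1">might</wf>
#     </text>
#     <terms>
#         <term tid="t1" lemma="might">
#             <span><target id="w1"/></span>
#         </term>
#     </terms>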
def get_sentences_words(directory):
    """Takes all KAF files in a directory and returns a dictionary with the
    sentence id as key and a list of the word ids as value."""
    dict_sentences_words = {}
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        for wf_element in root.find("text").findall("wf"):
            sent_id = filename + "_s" + wf_element.get("sent")
            word_id = filename + "_" + wf_element.get("wid")
            dict_sentences_words.setdefault(sent_id, []).append(word_id)
    return dict_sentences_words
def get_opinionated_sentences(directory):
    """Takes all KAF files from a directory and returns a set of all the
    opinionated sentences in these files (sentence ids). Unlike the
    speculative extractors, this does not filter on polarity."""
    opinionated_terms = []
    opinionated_words = set()
    opinionated_sentences = set()

    # Collect the term ids that occur in any opinion expression.
    print "Collecting opinionated terms"
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        opinions_layer = root.find("opinions")
        if opinions_layer is None:
            print "Error: no opinion found in", filename
            continue
        for opinion in opinions_layer.findall("opinion"):
            for expression in opinion.findall("opinion_expression"):
                span = expression.find("span")
                for target in span.findall("target"):
                    opinionated_terms.append(filename + "_" + target.get("id"))
    opinionated_terms = set(opinionated_terms)  # speed up membership tests

    # Map opinionated term ids to word ids via the terms-words dictionary.
    print "Collecting opinionated words"
    dict_terms_words = get_terms_words(directory)
    for term in dict_terms_words:
        if term in opinionated_terms:
            opinionated_words.add(dict_terms_words[term])

    # Mark every sentence that contains an opinionated word.
    print "Collecting opinionated sentences"
    dict_sentences_words = get_sentences_words(directory)
    for sent_id, span_words in dict_sentences_words.items():
        for word_id in span_words:
            if word_id in opinionated_words:
                opinionated_sentences.add(sent_id)
    return opinionated_sentences
def get_speculative_tokens(directory):
    """Takes all KAF files from a directory and returns a deduplicated list
    of all the tokens that are part of a speculative opinion expression."""
    speculative_terms = []
    speculative_wordids = set()
    speculative_words = []

    # Collect the term ids marked as speculative in every KAF file.
    for filename in listdir(directory):
        path = os.path.join(directory, filename)
        try:
            root = get_root(path)
        except Exception:
            print "Error: could not process", filename
            continue
        opinions_layer = root.find("opinions")
        if opinions_layer is None:
            print "Error: no opinion found in", filename
            continue
        for opinion in opinions_layer.findall("opinion"):
            for expression in opinion.findall("opinion_expression"):
                if expression.get("polarity") == "speculation":
                    span = expression.find("span")
                    for target in span.findall("target"):
                        speculative_terms.append(filename + "_" + target.get("id"))
    speculative_terms = set(speculative_terms)

    # Map speculative term ids to word ids via the terms-words dictionary.
    dict_terms_words = get_terms_words(directory)
    for term in dict_terms_words:
        if term in speculative_terms:
            speculative_wordids.add(dict_terms_words[term])

    # Map the speculative word ids to the word forms themselves.
    dict_wordids_words = get_wordids_words(directory)
    for wordid in dict_wordids_words:
        if wordid in speculative_wordids:
            speculative_words.append(dict_wordids_words[wordid])
    return list(set(speculative_words))
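# A small driver tying the extractors together. The directory path is a
# placeholder assumption; point it at a folder of KAF files:
if __name__ == "__main__":
    corpus_dir = "kaf_corpus"  # hypothetical path
    print "Speculative lemmas:", get_speculative_lemmas(corpus_dir)
    print "Opinionated sentences:", len(get_opinionated_sentences(corpus_dir))
    for sent_id, cues in get_hedges_per_sentence(corpus_dir).items():
        if cues != "No hedge cues":
            print sent_id, "->", cues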