def preprocess(text, fileno):
    """Tokenize *text*, count word frequencies, and fold the counts into the
    module-level ``features`` table at document index *fileno*.

    Side effects (module-level state, defined elsewhere in this file):
      - appends the raw token count of *text* to ``totalWords``
      - for each disambiguated word, adds its frequency into
        ``features[transformed]`` at position *fileno*
      - reads the stopword collection ``stop``  -- TODO confirm these globals

    Returns a list of ``(word, wordnet_sense_name)`` pairs for every word
    that ``most_frequent_sense``/``transform`` handled without error.
    """
    # Hoisted, pre-compiled filters (the original re-compiled both patterns
    # for every token inside the comprehension).
    word_start = re.compile(r"^\w+")
    non_digit_start = re.compile(r"^[^0-9]")

    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", text.lower())
    totalWords.append(len(tokens))  # record raw token count before filtering

    # Keep tokens that start with a non-digit word character, are not
    # stopwords, and are longer than two characters.
    tokens = [t for t in tokens if word_start.match(t) and non_digit_start.match(t)]
    tokens = [t for t in tokens if t not in stop and len(t) > 2]

    # Counter maps word -> frequency directly.  The original built parallel
    # lists via Counter(...).keys() and Counter(...).values() and zipped them
    # by index, which is redundant and breaks on Python 3 (dict views are
    # not indexable).
    unique_counts = Counter(tokens)

    extendedResult = []
    for word, count in unique_counts.items():
        try:
            answer = most_frequent_sense(str(word))
            transformed = transform(answer.name())[0]
            extendedResult.append((word, answer.name()))
            key = str(transformed)
            # Add this word's frequency into the column for this document.
            features[key] = [
                item + count if x == fileno else item
                for x, item in enumerate(features[key])
            ]
        except Exception:
            # Best-effort: skip words WordNet/transform cannot handle.
            # (Narrowed from a bare except so Ctrl-C etc. still propagate.)
            continue
    return extendedResult
def wordnet2():
    """Build identity / non-identity word lists from WordNet and write them
    to disk.

    Breadth-first walks the hyponym tree under ``person.n.01``; a lemma is
    an "identity" when its most frequent sense is exactly the synset it was
    found under.  Writes:
      - <IDENTITY_DICTIONARIES_LOCATION>/wordnet_identities.txt
      - tmp/all_wordnet_identities_terms.txt
      - <NON_IDENTITY_DICTIONARIES_LOCATION>/wordnet.txt

    Relies on module-level names defined elsewhere in this file: ``wn``,
    ``most_frequent_sense`` and the *_DICTIONARIES_LOCATION constants.
    """
    # Set membership is O(1); the original kept this as a list and scanned
    # it with `in` inside the big loop below.  The original also never
    # closed this file handle.
    with open(os.path.join(IDENTITY_DICTIONARIES_LOCATION,
                           "identities.txt")) as fh:
        act_identities = {x.strip() for x in fh}

    to_visit = Queue()
    to_visit.put(wn.synset('person.n.01'))
    identities = set()
    all_identity_terms = set()
    while not to_visit.empty():
        synset = to_visit.get()
        # Plain loop instead of a side-effect list comprehension.
        for hypo in synset.hyponyms():
            to_visit.put(hypo)
        for lemma in synset.lemmas():
            all_identity_terms.add(lemma.name().lower())
            mf_synset = most_frequent_sense(lemma.name())
            # Identity only when this synset IS the lemma's dominant sense.
            if mf_synset == synset and len(lemma.name()) > 2:
                identities.add(lemma.name().lower())

    with open(os.path.join(IDENTITY_DICTIONARIES_LOCATION,
                           "wordnet_identities.txt"), "w") as f:
        for p in identities:
            f.write(p.replace("_", " ") + "\n")

    with open("tmp/all_wordnet_identities_terms.txt", "w") as a_f:
        for p in all_identity_terms:
            a_f.write(p.replace("_", " ") + "\n")

    # Everything under person.n.01, transitively.
    all_person = set(wn.synset('person.n.01').closure(lambda s: s.hyponyms()))
    all_non_people = set()
    for s in list(wn.all_synsets('n')) + list(wn.all_synsets('a')):
        if s in all_person or 'person' in s.definition():
            continue
        for lemma in s.lemmas():
            lem_name = lemma.name().lower()
            # Heuristic blacklist: drop anything that looks person-related.
            # (lem_name[:-1] catches simple plurals of known identities.)
            if (lem_name not in act_identities
                    and lem_name not in all_identity_terms
                    and lem_name[:-1] not in identities
                    and 'person' not in str(lemma)
                    and 'people' not in lem_name
                    and 'police' not in lem_name
                    and 'body' not in lem_name
                    and 'girl' not in lem_name
                    and ' boy' not in lem_name
                    and ' man' not in lem_name
                    and ' woman' not in lem_name
                    and len(lem_name) > 2):
                all_non_people.add(lem_name)

    with open(os.path.join(NON_IDENTITY_DICTIONARIES_LOCATION,
                           "wordnet.txt"), "w") as f:
        for lem_name in all_non_people:
            f.write(lem_name.replace("_", " ").lower() + "\n")
# NOTE(review): Python 2 test-script fragment for pywsd sense helpers with its
# newlines stripped.  It begins with an orphan `except:` (the matching `try` is
# in an earlier chunk of the original file) and ends inside a dangling `try:`,
# so this chunk is incomplete at both edges -- left byte-identical.  The
# try/except pairs presumably guard NLTK versions where Synset.definition was
# an attribute rather than a method -- TODO confirm.
except: definition = answer.definition print "Definition:", definition print print "#TESTING first_sense() ..." print "Context:", bank_sents[0] answer = first_sense('bank') print "Sense:", answer try: definition = answer.definition() except: definition = answer.definition print "Definition:", definition print print "#TESTING most_frequent_sense() ..." print "Context:", bank_sents[0] answer = most_frequent_sense('bank') print "Sense:", answer try: definition = answer.definition() except: definition = answer.definition print "Definition:", definition print print "======== TESTING similarity ===========\n" from pywsd.similarity import max_similarity for sim_choice in ["path", "lch", "wup", "res", "jcn", "lin"]: print "Context:", bank_sents[0] print "Similarity:", sim_choice answer = max_similarity(bank_sents[0], 'bank', sim_choice, pos="n") print "Sense:", answer try: definition = answer.definition()
print "Sense:", answer definition = answer.definition() print "Definition:", definition print print "#TESTING first_sense() ..." print "Context:", bank_sents[0] answer = first_sense('bank') print "Sense:", answer definition = answer.definition() print "Definition:", definition print print "#TESTING most_frequent_sense() ..." print "Context:", bank_sents[0] answer = most_frequent_sense('bank') print "Sense:", answer definition = answer.definition() print "Definition:", definition print print "======== TESTING similarity ===========\n" from pywsd.similarity import max_similarity for sim_choice in ["path", "lch", "wup", "res", "jcn", "lin"]: print "Context:", bank_sents[0] print "Similarity:", sim_choice answer = max_similarity(bank_sents[0], 'bank', sim_choice, pos="n") print "Sense:", answer definition = answer.definition() print "Definition:", definition