def token_distance(token: str, other: str,
                   metrics: Collection[Distance] = frozenset({Distance.NAIVE})) -> float:
    distance = 0.0
    token_lemma = lemmatize(token)
    other_lemma = lemmatize(other)
    if Distance.POS in metrics:
        token_pos = pos_tags([token])[0][1]
        other_pos = pos_tags([other])[0][1]
        distance += int(simplify_tag(token_pos) != simplify_tag(other_pos))
    if Distance.NAIVE in metrics:
        distance += int(token_lemma != other_lemma)
    if Distance.LENGTH in metrics:
        distance += abs(len(token_lemma) - len(other_lemma))
    if Distance.LEVENSHTEIN in metrics:
        distance += edit_distance(token_lemma, other_lemma)
    wordnet_metrics = {Distance.PATH, Distance.WU_PALMER, Distance.LEACOCK_CHORDOROW}
    if any(d in metrics for d in wordnet_metrics):
        try:
            synset1, synset2 = wn.synsets(token_lemma)[0], wn.synsets(other_lemma)[0]
        except IndexError:
            # No synset for one of the lemmas: count every requested WordNet metric as a
            # full miss (sum(...) counts the requested ones; len([...]) would always be 3).
            distance += sum(d in metrics for d in wordnet_metrics)
            return distance / len(metrics)
        if Distance.PATH in metrics:
            distance += 1 - wn.similarity.path(synset1, synset2)
        if Distance.WU_PALMER in metrics:
            distance += 1 - wn.similarity.wup(synset1, synset2)
        if Distance.LEACOCK_CHORDOROW in metrics:
            distance += 1 - wn.similarity.lch(synset1, synset2)
    return distance / len(metrics)
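# Minimal standalone sketch (illustrative names only, not part of the module above):
# why the IndexError fallback counts requested WordNet metrics with sum(...) rather
# than len([...]): a comprehension of booleans always has length 3, while sum(...)
# counts only the metrics that were actually asked for.
def _count_requested_metrics_example():
    requested = {"PATH"}  # pretend only the path metric was requested
    always_three = len([m in requested for m in {"PATH", "WUP", "LCH"}])
    only_requested = sum(m in requested for m in {"PATH", "WUP", "LCH"})
    return always_three, only_requested  # (3, 1)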
def check_phrase_for_occupations_nobreaks(s):
    import nlp
    from occ import set2code
    found = []
    words = nlp.word_tokenize(s)
    words = [nlp.lemmatize(x) for x in words]
    # Collect candidate 1- to 4-word phrases (as frozensets of lemmas) with no gaps allowed.
    sets = set()
    for n in range(1, 5):
        sets.update(nlp.getCloseUnorderedSets(words, minTuple=n, maxTuple=n, maxBuffer=0))
    for fs in sets:
        if fs in set2code:
            c = set2code[fs]["code"]
            found.append({"word": " ".join(fs), "occ": [c], "fs": fs})

    def is_subset_anyone(x):
        return any(x['fs'] != y['fs'] and x['fs'].issubset(y['fs']) for y in found)

    # Keep only maximal matches: drop any match whose word set is contained in a longer one.
    found = [x for x in found if not is_subset_anyone(x)]
    return found
def fetch_questions(q):
    lemmae_q = lemmatize(q)
    print(lemmae_q)
    category = 'misc'
    # Pick a category whose keyword list shares a lemma with the question.
    # (The break only exits the innermost loop, so a later category can still
    # overwrite an earlier match; fetch_data below guards against this with a flag.)
    for x in categories:
        for y in categories[x]:
            for z in lemmae_q:
                if z == y:
                    category = x
                    print("match =>" + category)
                    break
    u1 = category  # .decode('utf-8')
    print(type(u1))
    docs = db.collection(u'questions').where(u'category', u'==', u1).get()
    questions = []
    threshold = 0.4
    for doc in docs:
        w = doc.to_dict()
        match_percent = is_match(w[u'question'], q)
        if match_percent >= threshold:
            questions.append({
                'qid': w[u'qid'],
                'user': w[u'user'],
                'timestamp': w[u'timestamp'],
                'category': w[u'category'],
                'question': w[u'question']
            })
    print(questions)
    return questions
def concept_associations(handle):
    '''
    This needs to be cultural knowledge. Urban dictionary?
    '''
    important_relations = ['CapableOf', 'Desires', 'RelatedTo', 'DefinedAs']
    associations = set()
    for word in str(lemmatize(handle)).lower().split(' '):
        if 'trump' in word:
            word = 'president'
        if is_common_word(word):
            continue
        try:
            word_information = c.look_up_word(word)
        except KeyError:
            continue
        for relation in important_relations:
            # Keep the object of each relevant, clean fact (last token, underscores as spaces).
            associations = associations.union([
                fact.lower().split(' ')[-1].replace('_', ' ')
                for fact in word_information
                if relation in fact and string_is_clean(fact)
            ])
    return sorted(filter_strings(list(associations)))
def fetch_category(q):
    lemmae_q = lemmatize(q)
    print(lemmae_q)
    category = 'misc'
    for x in categories:
        for y in categories[x]:
            for z in lemmae_q:
                if z == y:
                    category = x
                    print("match =>" + category)
                    break
    return category
def fetch_data(q):
    print(q)
    lemmae_q = lemmatize(q)
    print(lemmae_q)
    category = 'misc'
    print(type(category))
    flag = False
    # Stop at the first category whose keyword list contains one of the question lemmas.
    for x in categories:
        for y in categories[x]:
            # print("---- " + y)
            for z in lemmae_q:
                if z == y:
                    category = x
                    print("match =>" + category)
                    flag = True
            if flag:
                break
        if flag:
            break
    print(category)
    u1 = category  # .decode('utf-8')
    docs = db.collection('questions').where('category', '==', u1).get()
    qid = 0
    threshold = 0
    # Track the stored question that best matches q.
    for doc in docs:
        w = doc.to_dict()
        match_percent = is_match(w[u'question'], q)
        if match_percent > threshold:
            qid = w[u'qid']
            threshold = match_percent
            print(str(qid) + " " + w[u'question'])
    final_ans = "Sorry I don't know the answer"
    upvotes = 0
    print(threshold)
    # Only trust matches above 0.4; return the highest-upvoted answer for that question.
    if qid != 0 and threshold >= 0.4:
        ans_docs = db.collection(u'answers').where(u'qid', u'==', qid).get()
        for doc in ans_docs:
            ans = doc.to_dict()
            if ans[u'qid'] == qid:
                if upvotes < ans[u'upvotes']:
                    upvotes = ans[u'upvotes']
                    final_ans = ans[u'answer']
    return final_ans
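# Standalone sketch (hypothetical category table, not the app's real data) of the
# lemma/keyword matching shared by fetch_category, fetch_questions and fetch_data:
# the first category whose keyword list contains one of the question lemmas wins,
# otherwise 'misc' is returned.
def _match_category_example(lemmae_q, categories):
    for cat, keywords in categories.items():
        if any(tok in keywords for tok in lemmae_q):
            return cat
    return 'misc'
# e.g. _match_category_example(["best", "pizza", "recipe"],
#                              {"sports": ["football"], "food": ["pizza", "recipe"]})  # -> 'food'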
def get_specific(sentence):
    words = sentence.lower().split(' ')
    for i, word in enumerate(words):
        word_info = c.look_up_word(lemmatize(word))
        print('word_info', word_info)
        for fact in word_info:
            first, relation, second = fact.split(' ')
            if relation == 'InstanceOf' and second == word:
                print('fact', fact)
                words[i] = first
    return ' '.join(words)
def loadAssociations():
    global codes
    global term2code
    global set2code
    CSV_fn = path.join(path.dirname(__file__), "..", "w2c_source", "compiledCodes.csv")
    print("Loading term-code associations into variable 'codes' from %s..." % CSV_fn)
    print("Loading term dictionary into variable 'term2code' from %s..." % CSV_fn)
    with open(CSV_fn, 'r') as outCodesF:
        CSV_r = DictReader(outCodesF)
        codes = list(CSV_r)
    for code in codes:
        term2code[code["term"]] = code
        # Index each term by the frozenset of its lemmatized tokens for phrase lookup.
        words = nlp.word_tokenize(code["term"])
        words = [nlp.lemmatize(x) for x in words]
        set2code[frozenset(words)] = code
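# Hypothetical usage sketch (assumes the occ/nlp modules and compiledCodes.csv
# referenced above are available; shown as comments because it is not runnable
# on its own):
#   loadAssociations()                                   # populate codes / term2code / set2code
#   hits = check_phrase_for_occupations_nobreaks("she worked as a registered nurse")
#   for h in hits:
#       print(h["word"], h["occ"])                       # matched phrase and its occupation code(s)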