Example 1
def token_distance(token: str, other: str, metrics: Collection[Distance] = frozenset({Distance.NAIVE})) -> float:
    distance = 0.0
    token_lemma = lemmatize(token)
    other_lemma = lemmatize(other)

    if Distance.POS in metrics:
        token_pos = pos_tags([token])[0][1]
        other_pos = pos_tags([other])[0][1]
        distance += int(simplify_tag(token_pos) != simplify_tag(other_pos))
    if Distance.NAIVE in metrics:
        distance += int(token_lemma != other_lemma)
    if Distance.LENGTH in metrics:
        distance += abs(len(token_lemma) - len(other_lemma))
    if Distance.LEVENSHTEIN in metrics:
        distance += edit_distance(token_lemma, other_lemma)
    wordnet_metrics = {Distance.PATH, Distance.WU_PALMER, Distance.LEACOCK_CHORDOROW}
    if any(d in metrics for d in wordnet_metrics):
        try:
            synset1, synset2 = wn.synsets(token_lemma)[0], wn.synsets(other_lemma)[0]
        except IndexError:
            # No synset for one of the lemmas: count the maximum distance (1)
            # for each requested WordNet metric.
            distance += sum(1 for d in wordnet_metrics if d in metrics)
            return distance / len(metrics)
        if Distance.PATH in metrics:
            distance += 1 - wn.similarity.path(synset1, synset2)
        if Distance.WU_PALMER in metrics:
            distance += 1 - wn.similarity.wup(synset1, synset2)
        if Distance.LEACOCK_CHORDOROW in metrics:
            distance += 1 - wn.similarity.lch(synset1, synset2)

    return distance / len(metrics)
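
A minimal usage sketch (not from the original source), assuming the surrounding module defines the Distance enum and the lemmatize/pos_tags helpers, with edit_distance and wn presumably coming from NLTK and the wn package:

# Hypothetical usage; Distance, lemmatize, pos_tags, edit_distance and wn
# are assumed to be provided by the surrounding module.
d = token_distance("running", "ran", metrics={Distance.NAIVE, Distance.LEVENSHTEIN})
print(d)  # total distance averaged over the number of requested metrics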
Example 2
def check_phrase_for_occupations_nobreaks(s):
    import nlp
    from occ import set2code

    found = []

    words = nlp.word_tokenize(s)
    words = [nlp.lemmatize(x) for x in words]

    # Collect all nearby unordered word combinations of size 1 through 4.
    sets = set()
    for n in range(1, 5):
        sets.update(
            nlp.getCloseUnorderedSets(words, minTuple=n, maxTuple=n, maxBuffer=0))

    for fs in sets:
        if fs in set2code:
            c = set2code[fs]["code"]

            found.append({"word": " ".join(fs), "occ": [c], "fs": fs})

    # Keep only maximal matches: drop any match whose word set is a
    # proper subset of another match's word set.
    def is_subset_anyone(x):
        for y in found:
            if x['fs'] != y['fs'] and x['fs'].issubset(y['fs']):
                return True
        return False

    found = [x for x in found if not is_subset_anyone(x)]

    return found
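
A usage sketch under the assumption that nlp and occ are project-local modules and that set2code has been populated (see Example 8):

# Hypothetical usage; requires the project-local nlp and occ modules.
matches = check_phrase_for_occupations_nobreaks("she worked as a registered nurse")
for m in matches:
    print(m["word"], m["occ"])  # matched phrase and its occupation code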
Example 3
def fetch_questions(q):
    lemmae_q = lemmatize(q)
    print(lemmae_q)
    category = 'misc'

    for x in categories:
        for y in categories[x]:
            if y in lemmae_q:
                category = x
                print("match => " + category)

    u1 = category
    print(type(u1))

    docs = db.collection(u'questions').where(u'category', u'==', u1).get()

    questions = []
    threshold = 0.4
    for doc in docs:
        w = doc.to_dict()
        match_percent = is_match(w[u'question'], q)
        if match_percent >= threshold:
            temp = {
                'qid': w[u'qid'],
                'user': w[u'user'],
                'timestamp': w[u'timestamp'],
                'category': w[u'category'],
                'question': w[u'question']
            }
            questions.append(temp)

    print(questions)
    return questions
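
A usage sketch, assuming db is an initialized Firestore client and that categories, lemmatize and is_match are defined at module level:

# Hypothetical usage; db, categories and is_match are assumed to exist.
for q in fetch_questions("How do I reset my password?"):
    print(q['qid'], q['question'])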
Example 4
def concept_associations(handle):
    '''
    This needs to be cultural knowledge.
    Urban dictionary?
    '''
    important_relations = ['CapableOf', 'Desires', 'RelatedTo', 'DefinedAs']

    associations = set()

    for word in str(lemmatize(handle)).lower().split(' '):
        if 'trump' in word:
            word = 'president'

        if is_common_word(word):
            continue

        try:
            word_information = c.look_up_word(word)
        except KeyError:
            continue

        for relation in important_relations:
            # Keep the object of each matching fact, e.g.
            # "cat CapableOf hunt_mice" -> "hunt mice".
            associations.update(
                fact.lower().split(' ')[-1].replace('_', ' ')
                for fact in word_information
                if relation in fact and string_is_clean(fact))

    return sorted(filter_strings(list(associations)))
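
A usage sketch, assuming c is a preloaded knowledge-base lookup object and that is_common_word, string_is_clean and filter_strings are importable helpers:

# Hypothetical usage; c and the helper predicates are assumed to exist.
for assoc in concept_associations("hungry cat"):
    print(assoc)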
Example 5
def fetch_category(q):
    lemmae_q = lemmatize(q)
    print(lemmae_q)
    category = 'misc'

    for x in categories:
        for y in categories[x]:
            if y in lemmae_q:
                category = x
                print("match => " + category)

    return category
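
The same matching step in isolation; 'misc' is the fallback when no category keyword appears among the question's lemmas:

# Hypothetical usage; categories and lemmatize are assumed to exist.
print(fetch_category("What is the capital of France?"))  # a category key, or 'misc'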
Example 6
def fetch_data(q):
    print(q)
    lemmae_q = lemmatize(q)

    print(lemmae_q)
    category = 'misc'
    print(type(category))
    matched = False
    for x in categories:
        for y in categories[x]:
            if y in lemmae_q:
                category = x
                print("match => " + category)
                matched = True
                break
        if matched:
            break
    print(category)
    u1 = category

    docs = db.collection('questions').where('category', '==', u1).get()

    qid = 0
    threshold = 0
    for doc in docs:
        w = doc.to_dict()
        match_percent = is_match(w[u'question'], q)
        if match_percent > threshold:
            qid = w[u'qid']
            threshold = match_percent
            print(str(qid) + " " + w[u'question'])
    final_ans = "Sorry I don't know the answer"
    upvotes = 0
    print(threshold)
    if qid != 0 and threshold >= 0.4:
        ans_docs = db.collection(u'answers').where(u'qid', u'==', qid).get()

        for doc in ans_docs:
            ans = doc.to_dict()
            if ans[u'qid'] == qid:
                if upvotes < ans[u'upvotes']:
                    upvotes = ans[u'upvotes']
                    final_ans = ans[u'answer']

    return final_ans
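
A usage sketch; given the same assumed Firestore setup as Example 3, the function returns the highest-upvoted answer to the best-matching stored question:

# Hypothetical usage; db, categories and is_match are assumed to exist.
print(fetch_data("How do I reset my password?"))  # best answer, or the fallback string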
Example 7
def get_specific(sentence):
    words = sentence.lower().split(' ')

    for i, word in enumerate(words):
        try:
            word_info = c.look_up_word(lemmatize(word))
        except KeyError:
            # Word not in the knowledge base (cf. Example 4); skip it.
            continue

        print('word_info', word_info)

        for fact in word_info:
            parts = fact.split(' ')
            if len(parts) != 3:
                continue  # skip facts that are not simple triples
            first, relation, second = parts

            if relation == 'InstanceOf' and second == word:
                print('fact', fact)
                words[i] = first

    return ' '.join(words)
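
A usage sketch, assuming c.look_up_word returns facts as simple 'subject Relation object' strings:

# Hypothetical usage; c and lemmatize are assumed to be in scope.
print(get_specific("i saw a bird"))  # "bird" may be replaced by a specific instance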
Example 8
def loadAssociations():
    global codes
    global term2code
    global set2code

    CSV_fn = path.join(path.dirname(__file__), "..", "w2c_source", "compiledCodes.csv")
    print("Loading term-code associations into variable 'codes' from %s..." % CSV_fn)
    print("Loading term dictionary into variable 'term2code' from %s..." % CSV_fn)

    with open(CSV_fn, 'r') as codesF:
        CSV_r = DictReader(codesF)
        codes = list(CSV_r)

    for code in codes:
        # Exact-term lookup.
        term2code[code["term"]] = code

        # Order-insensitive lookup: a frozenset of the term's lemmas.
        words = nlp.word_tokenize(code["term"])
        words = [nlp.lemmatize(x) for x in words]
        set2code[frozenset(words)] = code
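
A usage sketch; calling the loader populates the module-level globals consumed by check_phrase_for_occupations_nobreaks (Example 2):

# Hypothetical usage; nlp and the CSV path are assumed to be available.
loadAssociations()
print(len(codes), "codes loaded")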