Example No. 1
	def __init__(self, lang='en'):
		# Load the cached concept assertions if the pickle exists;
		# fall back to an empty dict otherwise.
		try:
			assertion_pickle = open('data/concept_assertion.pickle', 'rb')
			self.concept_assertions = pickle.load(assertion_pickle)
			assertion_pickle.close()
		except (IOError, pickle.UnpicklingError):
			self.concept_assertions = {}
		self.nlp = simplenlp.get(lang)
		self.stopwords = stopwords.words('english')
		self.stopwords.extend(concept_stopwords)
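For completeness, a hedged sketch of the write side of this pickle cache. The original class may persist its data differently; save_assertions is a hypothetical helper, not part of the source.

import pickle

def save_assertions(concept_assertions):
    # Hypothetical counterpart to the load in __init__ above.
    out = open('data/concept_assertion.pickle', 'wb')
    pickle.dump(concept_assertions, out)
    out.close()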
Example No. 2
def lookup_concept_from_nl(language, text):
    """
    Look up a concept using any natural language text that represents it.
    This function requires the :mod:`simplenlp` module
    to normalize natural language text into a raw concept name.
    """
    import simplenlp
    nltools = simplenlp.get('en')

    normalized = nltools.normalize(text)
    return lookup_concept_raw(language, normalized)
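A hedged usage sketch of the normalization step this lookup relies on; the lemma in the comment is an assumption, since the exact output depends on the installed simplenlp version.

import simplenlp
nltools = simplenlp.get('en')
# 'dogs' should reduce to its lemma before the raw lookup
# (assumed output: 'dog').
print nltools.normalize('dogs')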
Example No. 3
def make_concept_uri(text, lang, disambiguation=None):
    if lang == 'en':
        from metanl import english
        normalized, disambig = english.normalize_topic(text)
    elif lang == 'ja':
        from metanl import japanese
        normalized, disambig = japanese.normalize(text), None
    elif lang in ('pt', 'hu', 'nl', 'es'):
        # languages where we know the stopword list
        import simplenlp
        nlp = simplenlp.get(lang)
        normalized, disambig = nlp.normalize(text), None
    else:
        normalized = text
        disambig = None
    if disambiguation is not None:
        disambig = disambiguation
    if disambig is not None:
        disambig = disambig.replace(' ', '_')
    if disambig:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambig)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))
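A few hedged example calls. The exact normalized forms depend on the metanl and simplenlp versions in use, so the URIs in the comments are assumptions, not verified outputs.

make_concept_uri('test', 'en')                 # e.g. '/c/en/test'
make_concept_uri('test', 'en', 'a trial run')  # e.g. '/c/en/test/a_trial_run'
make_concept_uri('quelque chose', 'fr')        # no 'fr' backend, so the text
                                               # passes through: '/c/fr/quelque_chose'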
Example No. 4
import simplenlp
EN = simplenlp.get('en')
from conceptnet5.english_nlp import normalize


def check_line(line):
    parts = line.strip().split()
    if len(parts) < 2:
        # skip blank or malformed lines
        return
    norm = normalize(parts[0])
    if norm != parts[1]:
        print "Original: %s / WordNet: %s / conceptnet: %s" % \
            (parts[0], parts[1], norm)


def compare_words():
    for line in open('/Users/rspeer/nltk_data/corpora/wordnet/noun.exc'):
        check_line(line)

    for line in open('/Users/rspeer/nltk_data/corpora/wordnet/verb.exc'):
        check_line(line)


if __name__ == '__main__':
    compare_words()
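For reference, each line of the WordNet exception files read above maps an inflected form to one or more root forms, separated by spaces:

# Sample noun.exc lines (inflected form first, then its root):
#   geese goose
#   oxen ox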
Example No. 5
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import wordnet
import simplenlp
import re
EN = simplenlp.get('en')

try:
    morphy = wordnet.morphy
except LookupError:
    nltk.download('wordnet')
    morphy = wordnet.morphy

STOPWORDS = ['the', 'a', 'an']

EXCEPTIONS = {
    # Avoid obsolete and obscure roots, the way lexicographers don't.
    'wrought': 'wrought',   # not 'work'
    'media': 'media',       # not 'medium'
    'installed': 'install', # not 'instal'
    'installing': 'install', # not 'instal'
    'synapses': 'synapse',  # not 'synapsis'
    'soles': 'sole',        # not 'sol'
    'pubes': 'pube',        # not 'pubis'
    'dui': 'dui',           # not 'duo'
    'comics': 'comic',      # WordNet's root for this will make you nerd-rage
    'taxis': 'taxi',        # not 'taxis'
    'teeth': 'tooth',       # not 'teeth'

    # Avoid nouns that shadow more common verbs.
    'am': 'be',
Example No. 6
def put_raw_assertion_in_graph(raw):
    try:
        lang = raw.language_id
        # Skip low-quality frames, Chinese-language data, and
        # Ruby Commons contributions.
        if raw.frame.goodness < 1: return
        if lang.startswith('zh'): return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return

        # build the assertion
        raw_arg1 = GRAPH.get_or_create_concept(lang, raw.text1)
        raw_arg2 = GRAPH.get_or_create_concept(lang, raw.text2)
        frame_text = raw.frame.text
        if polarity > 0:
            frame_text = frame_text.replace('{%}', '')
        else:
            frame_text = frame_text.replace('{%}', 'not')
        frame = GRAPH.get_or_create_frame(lang, frame_text)
        raw_assertion = GRAPH.get_or_create_assertion(
            frame,
            [raw_arg1, raw_arg2],
            {'dataset': 'conceptnet/4/'+lang, 'license': 'CC-By', 'normalized': False}
        )
        
        # create justification structure
        creator = raw.sentence.creator.username
        if creator == 'verbosity': return
        creator_node = GRAPH.get_or_create_node(
          u'/source/contributor/omcs/'+creator
        )
        activity_node = GRAPH.get_or_create_node(u'/source/activity/omcs/'+activity)
        GRAPH.justify(OMCS, activity_node)
        GRAPH.justify(OMCS, creator_node)
        conjunction = GRAPH.get_or_create_conjunction(
            [creator_node, activity_node]
        )
        GRAPH.justify(conjunction, raw_assertion)

        # make the normalized version
        if lang == 'en':
            arg1 = GRAPH.get_or_create_concept('en', en_normalize(raw.text1))
            arg2 = GRAPH.get_or_create_concept('en', en_normalize(raw.text2))
        elif lang == 'ja':
            arg1 = GRAPH.get_or_create_concept('ja', JA.normalize(raw.text1))
            arg2 = GRAPH.get_or_create_concept('ja', JA.normalize(raw.text2))
        else:
            nlp = simplenlp.get(lang)
            arg1 = GRAPH.get_or_create_concept(lang, nlp.normalize(raw.text1))
            arg2 = GRAPH.get_or_create_concept(lang, nlp.normalize(raw.text2))

        if polarity > 0:
            relation = GRAPH.get_or_create_relation(raw.frame.relation.name)
        else:
            relation = GRAPH.get_or_create_relation('Not'+raw.frame.relation.name)
        assertion = GRAPH.get_or_create_assertion(
            relation, [arg1, arg2],
            {'dataset': 'conceptnet/4/'+lang, 'license': 'CC-By', 'normalized': True}
        )
        for vote in raw.votes.all():
            voter = GRAPH.get_or_create_node(
              u'/source/contributor/omcs/'+vote.user.username
            )
            GRAPH.justify(OMCS, voter)
            GRAPH.justify(voter, raw_assertion, weight=vote.vote)

        GRAPH.derive_normalized(raw_assertion, assertion)
        print assertion
    except Exception:
        import traceback
        traceback.print_exc()
Example No. 7
from csc_utils.batch import queryset_foreach
from conceptnet.models import Sentence, Assertion, RawAssertion
from conceptnet5.graph import JSONWriterGraph
from conceptnet5.english_nlp import normalize as en_normalize
import simplenlp

GRAPH = JSONWriterGraph('json_data/conceptnet')

OMCS = GRAPH.get_or_create_node('/source/site/omcs')
GRAPH.justify('/', OMCS)

JA = simplenlp.get('ja')
# Monkey-patch: treat no Japanese token as a stopword, so nothing is
# dropped when JA.normalize runs.
def answer_false(*args):
    return False
JA.is_stopword_record = answer_false

Example No. 8
import simplenlp
from metanl import english
import math, random
from luminoso3.background_space import get_commonsense_assoc
from colorizer.color_data import make_lab_color_data, lab_to_rgb, rgb_to_hsv
from colorizer.colorvote import weighted_elect_samples

ENGLISH = simplenlp.get('en')
ASSOC = get_commonsense_assoc('en', 100)


COLORDATA = {}
origdata = make_lab_color_data()

def importance_factor(colorname):
    # Rarer color names get a larger importance factor, scaled by the
    # inverse square root of the head word's frequency.
    imp = 10000 / math.sqrt(english.word_frequency(colorname.split()[0], 1000000))
    return int(imp)

# Down-sample each color's example set, keeping more samples for
# more important (rarer) color names.
for key, values in origdata.items():
    subset_values = random.sample(values,
      min(len(values), int(math.ceil(importance_factor(key)*math.sqrt(len(values))))))
    COLORDATA[key] = subset_values


def output_colors(labcolors):
    return [lab_to_rgb(c) for c in sorted(labcolors)]

class IncrementalColorizer(object):
    def __init__(self, ncolors):
        self.ncolors = ncolors
        self.colors = [(128,128,128)] * ncolors
Example No. 9
def test_nai():
    ja = simplenlp.get('ja')
    # "No, I didn't understand." The polite past negative collapses to
    # the dictionary form plus the negation marker ない.
    test_sentence = u'いいえ、分かりませんでした。'
    assert ja.normalize(test_sentence) == u'いいえ 分かる ない'
Example No. 10
def test_normalize():
    ja = simplenlp.get('ja')
    # "This is a test, but it's okay." Only the content words survive:
    # テスト 'test' and 大丈夫 'okay'.
    test_sentence = u'これはテストですが、大丈夫です。'
    assert ja.normalize_list(test_sentence) == [u'テスト', u'大丈夫']
    assert ja.normalize(test_sentence) == u'テスト 大丈夫'
Example No. 11
def test_chinese():
    zh = simplenlp.get('zh-Hant')
    # "Disney line" (a railway name); normalization should leave
    # the text unchanged.
    railway = u"迪士尼线"
    assert zh.normalize(railway) == railway
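By analogy with these tests, a hedged sketch of the English backend on a parallel sentence; the expected output is an assumption that depends on the installed simplenlp version.

import simplenlp
en = simplenlp.get('en')
# Stopwords dropped, content words lemmatized; assumed output: u'test okay'
print en.normalize(u'This is a test, but it is okay.')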