Example #1
    def nl(self):
        """
        A collection of natural language tools for a language.

        See :mod:`simplenlp` for more information on using these tools.
        """
        return get_nl(self.id)
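Several of the later examples call methods on the object this property returns. Below is a minimal, hedged usage sketch assembled only from calls that appear elsewhere on this page (tokenize, is_stopword, extract_concepts); the printed values are illustrative and depend on the installed simplenlp and ConceptNet data.

from simplenlp import get_nl

en_nl = get_nl('en')                    # English tools, analogous to get_nl(self.id) above
text = "I have had a pretty crazy weekend"
print(en_nl.tokenize(text))             # separates punctuation from words, splits contractions
print(en_nl.is_stopword('the'))         # stopword check, as used in Example #7
print(en_nl.extract_concepts(text, max_words=2, check_conceptnet=True))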
Example #3
    def __init__(self, distance_weight=0.8, negation_weight=-0.5, cutoff=0.1):
        """
        Create a reader that reads Japanese text using `simplenlp`. You can
        optionally adjust the weights for how much various terms affect
        each other.
        """
        SimpleNLPReader.__init__(self, distance_weight, negation_weight,
                                 cutoff)
        self.nl = get_nl('ja')
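The class that wraps this __init__ is not included in the snippet. As a hedged illustration only, assuming the enclosing class were importable as JaTextReader (a hypothetical name, not confirmed by the snippet), the weights could be tuned at construction time:

# JaTextReader is a hypothetical name; the signature is copied from the __init__ above.
reader = JaTextReader(distance_weight=0.6, negation_weight=-0.4, cutoff=0.1)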
Example #4
	def __init__(self, emoticon_file=path+'/data/emoticons.csv', \
			affect_wordnet_file=path+'/data/affectiveWNmatrix.pickle'):
		# Build emoticon dictionary
		self.emoticon = {}
		emoticon_reader = csv.reader(open(emoticon_file, 'r'))
		for emoticon, meaning in emoticon_reader:
			self.emoticon[emoticon.decode('utf-8')] = meaning
		self.emoticon_list = self.emoticon.keys()
		# Create blending of affect WordNet and ConceptNet
		cnet = conceptnet_2d_from_db('en')
		affectwn_raw = get_picklecached_thing(affect_wordnet_file)
		affectwn_normalized = affectwn_raw.normalized()
		theblend = Blend([affectwn_normalized, cnet])
		self.affectwn = theblend.svd()
		# Get natural language processing tool
		self.nl = get_nl('en')
Example #5
def parser(txt):
    # Append the results to "output" and return it;
    # the debug code mentioned in Step 8 can then be removed.
    output = []
    en_nl = get_nl('en')            # specify language
    #en = Language.get('en')         # specify language

    # load articles from file
    #if os.name == 'nt':
    #    openfile = open('./in.txt', 'r')
    #if os.name == 'posix':
    #    import sys
    #    sys.path.append('/home/chsiensu/.conceptnet/nltk-2.0.1rc1')
    #    openfile = open('/home/chsiensu/.conceptnet/intext.txt', 'r')
    
    #raw = openfile.read()
    
    # input text from the web-page
    raw = txt

    '''
        raw: the original, unprocessed blog paragraph text
    '''
    tStart = time.time()
    '''
        record start time
    '''
    articleLengthCheck = 1
    print '\n===>step 1: extract_concepts'
    bigram = []
    concepts = en_nl.extract_concepts(raw, max_words=2, check_conceptnet=True)
    '''
        extract_concepts:
        Extract a list of the concepts that are directly present in text.
        max_words specifies the maximum number of words in the concept.
        If check_conceptnet is True, only concepts that are in ConceptNet for this
        language will be returned.
    '''
    if len(concepts) < 20:
        articleLengthCheck = 0
    if articleLengthCheck:
        print '=> concepts:'
        for x in concepts:
            print x
            if len(x.split()) == 2:
                bigram.append(x.split()[0]+ '_'+ x.split()[1])
                '''
                    Reform "ice cream" into "ice_cream" and push "ice_cream" onto bigram
                '''
        print '=> size(concepts):',len(concepts)
        print '\n=> bigram:'
        for x in bigram:
            print x
        print '=> size(bigram):',len(bigram)
    
        print '\n===>step 2: get Part-of-Speech(POS) tags'
        remainTags = ['NN','NNP','NNS']
        '''
            remainTags:
            Keep only tokens whose POS tags appear in
            ['NN','NNP','NNS']
            (see the Brown Corpus tag set:
            http://en.wikipedia.org/wiki/Brown_Corpus)

            original version of remainTags:
            remainTags = ['FW','JJ','JJR','JJT','NN','NN$','NNP','NNS','NP','NP$',
            'NPS','NPS$','NR','RB','RBR','RBT']
        '''
        raw2 = en_nl.tokenize(raw)
        '''
            en_nl.tokenize(raw):
            Inserts spaces so that punctuation is separated from words and
            contractions are split up.
        '''
    
        tokenizedRaw = nltk.word_tokenize(raw2)
        '''
            word_tokenize:
            Tokenizers divide strings into lists of substrings;
            word_tokenize divides a string into a list of words.
        '''
    
        posTag = nltk.pos_tag(tokenizedRaw)
        '''
            nltk.pos_tag:
            Use NLTK's currently recommended part of speech tagger to
            tag the given list of tokens.
        '''
    
        tags = []
        count = 0
        tagDepth = math.floor(math.log(len(tokenizedRaw))+2)
        #tagDepth = 8
        print '=> (token, normalized token, tag):'
        for tag in posTag:
            '''
                posTag:
                (friends, NNS)
                (Parking, NNP)
                ...
            '''
            if tag[1] in remainTags and len(tag[0]) > 2:
                try:
                    #wnTag = wn.synset(tag[0]+'.n.01')
                    wnTag = wn.synset(en_nl.word_split(tag[0])[0].lower()+'.n.01')
                    if len(wnTag.hypernym_distances()) > tagDepth:
                        count += 1
                        stemmedTag = en_nl.word_split(tag[0])
                        print tag[0], stemmedTag[0].lower(), tag[1], len(wnTag.hypernym_distances())
                        tags.append(stemmedTag[0].lower())
                        '''
                            stemmedTag:
                            normalized tokens, for example,
                            friends -> friend
                            Parking -> park
                        '''
                except:
                    pass
        print '=> size((token, normalized token, tag)):', count
    
        print '\n===>step 3: intersection of ( POS tags && extract_concepts )'
        '''
            In step 3,
            1) keywords = intersection of the sets from (Part-of-Speech tags)
               and (extract_concepts)
            2) Classify these keywords into categories with a desired
               distribution (the largest category should not contain almost
               all of the keywords)
        '''
        intersectTags = [x for x in tags if x in concepts]
        for x in bigram:
            try:
                wn.synset(x+'.n.01')
                intersectTags.append(x)
                '''
                    append bigrams to intersectTags
                '''
            except:
                pass
        print '=> intersectTags:'
        for x in intersectTags:
            print x
        print '=> size(intersectTags):', len(intersectTags)
        intersectTagsCopy = intersectTags
        intersectTags = list(set(intersectTags))
        category = []
        for x in intersectTags:
            category.append([[x] * intersectTagsCopy.count(x)])
        i = 0
        for x in intersectTags:
            category[i] = category[i][0]
            i += 1
        '''
            category:
            The list that keeps every occurrence of each keyword, e.g.
            [['dog', 'dog', 'dog'],
             ['cat', 'cat'],
             ...
            ]

            intersectTags:
            The deduplicated keyword list (occurrences are NOT kept), e.g.
            ['dog',
             'cat',
             ...
            ]
        '''
    
        iteration = 1
        threshold = 1.4
        categoryRatio = 1.0
        categoryCopy = copy.deepcopy(category)
        '''
            threshold:
            The Leacock-Chodorow similarity threshold starts at 1.4 (chosen
            through trial and error); any pair of keywords whose similarity
            falls below the threshold is discarded. If the threshold turns out
            to be too low to classify the keywords appropriately, it is
            increased by 0.1 on the next iteration.

            categoryRatio:
            After categorizing the keywords into n separate categories
            c(1), c(2), ..., c(n), we compute the ratio of the largest category
            as c(1) / ( c(1) + c(2) + c(3) ), where c(1), c(2) and c(3) are the
            sizes of the largest, 2nd largest and 3rd largest categories.

            If the ratio is above 0.8, too many keywords sit in c(1), so we
            reduce the keywords in c(1) and move some into c(2) and c(3)
            (by increasing the threshold by 0.1) until the top 3 categories
            are more evenly distributed.

            categoryCopy:
            Used to restore `category` at the start of the next iteration.
        '''
        outerCount = 0
        innerCount = 0
        tagSimilarity = []
        for tag1 in intersectTags:
            outerCount +=1
            for tag2 in intersectTags[outerCount:]:
                try:
                    '''
                        Why use try?
                        Some words (e.g. adjectives, adverbs) are incorrectly classified
                        as nouns and cause an error here: (tag1+'.n.01') and (tag2+'.n.01')
                        only resolve to noun synsets.
                    '''
                    wnTag1 = wn.synset(tag1+'.n.01')
                    wnTag2 = wn.synset(tag2+'.n.01')
                    if wnTag1.lch_similarity(wnTag2) > threshold:
                        tagSimilarity.append([wnTag1.lch_similarity(wnTag2), tag1, tag2])
                        '''
                            lch_similarity:
                            Leacock-Chodorow Similarity: returns a score denoting how similar
                            two word senses are, based on the shortest path that connects
                            the senses and the maximum depth of the taxonomy in which the
                            senses occur. The relationship is given as -log(p/2d), where p is
                            the shortest path length and d the taxonomy depth.
                        '''
                        innerCount +=1
                except:
                    pass
        while (categoryRatio > 0.8):
            category = copy.deepcopy(categoryCopy)
            tagSimilarity = [x for x in tagSimilarity if x[0] > threshold]
            sortedTagSimilarity = sorted(tagSimilarity, key=lambda tag: tag[0], reverse=True)
            print '\n=> sortedTagSimilarity:'
            for s in sortedTagSimilarity:
                '''
                    sortedTagSimilarity:
                    (           s[0]             , s[1], s[2])
                    (similarity of tag1 and tag4 , tag1, tag4) ## largest similarity
                    (similarity of tag3 and tag5 , tag3, tag5) ## 2nd largest similarity
                    ...

                    In this FOR loop, we:
                    1) Pop the set that contains s[1] from 'category'
                    2) Pop the set that contains s[2] from 'category'
                    3) Merge the sets from 1) and 2) into a bigger set whose
                       cardinality is the sum of 1) and 2)
                    4) Push the merged set back onto 'category'
                '''
                count = 0
                list1 = []
                for x in category:
                    if s[1] in x:
                        list1 = category.pop(count)
                        break
                    count += 1
                count = 0
                list2 = []
                for x in category:
                    if s[2] in x:
                        list2 = category.pop(count)
                        break
                    count += 1
                for x in list2:
                    list1.append(x)
                category.append(list1)
                print s
            print '=> size(sortedTagSimilarity):', len(sortedTagSimilarity)
            sortedCategory = []
            for a in category:
                sortedCategory.append([len(a),a])
            sortedCategory = sorted(sortedCategory, key=lambda tag: tag[0], reverse=True)
            categorySum = sortedCategory[0][0] + sortedCategory[1][0] + sortedCategory[2][0]
            categoryRatio = float(sortedCategory[0][0]) / categorySum
    
            print '\n=> category:'
            for x in category:
                print x
            print '=> number of category           : ', len(category)
            print '=> threshold                    : ', threshold
            print '=> size of largest category     : ', sortedCategory[0][0]
            print '=> size of 2nd largest category : ', sortedCategory[1][0]
            print '=> size of 3rd largest category : ', sortedCategory[2][0]
            print '=> categoryRatio                : ', categoryRatio
            print '=> End of iteration             : ', iteration
            print '=> ' * 10
            iteration += 1
            threshold += 0.1
    
    
        print '\n===>step 4: category prediction'
        '''
            Find concepts similar to the 3 largest categories:
            *sortedCategory[0][1] (at most 4 concepts)
            *sortedCategory[1][1] (at most 4 concepts)
            *sortedCategory[2][1] (at most 2 concepts)

            Uniformity is also taken into account. For example, if one category is
            ['dog', 'dog', 'dog', 'dog', 'cat', 'cat', 'cat', 'cat'],
            then at most 2 concepts will be extracted from this category,
            even though it has 8 elements.
        '''
        category0 = divisi2.category(*sortedCategory[0][1])
        category1 = divisi2.category(*sortedCategory[1][1])
        category2 = divisi2.category(*sortedCategory[2][1])
        cnet= divisi2.network.conceptnet_matrix('en')
        '''
            reconstruct similarity matrix U*(Sigma^2)*(U^T)
        '''
        concept_axes, axis_weights, feature_axes= cnet.svd(k=100)
        sim = divisi2.reconstruct_similarity(concept_axes, axis_weights, post_normalize=True)
        category0_top4 = sim.left_category(category0).top_items(n=4)
        category1_top4 = sim.left_category(category1).top_items(n=4)
        category2_top2 = sim.left_category(category2).top_items(n=2)
        outputTemp = []
        uniformity0 = len(set(sortedCategory[0][1]))
        uniformity1 = len(set(sortedCategory[1][1]))
        uniformity2 = len(set(sortedCategory[2][1]))
        print '=> category0:'
        for x in category0_top4[ : min(uniformity0 , 4)]:
            outputTemp.append(x)
            print x
        print '=> category1:'
        for x in category1_top4[ : min(uniformity1 , 4)]:
            outputTemp.append(x)
            print x
        print '=> category2:'
        for x in category2_top2[ : min(uniformity2 , 2)]:
            outputTemp.append(x)
            print x
    
        print '\n===>step 5: output file and calculate execution time'
        '''
            output = ['keyword1','keyword2',...]
        '''
        print '=> statistics     :'
        print '=> words count    : ', len(tokenizedRaw)
        print '=> # of concepts  : ', len(concepts)
        print '=> # of tags      : ', len(tags)
        print '=> # of category  : ', len(category)
        output = []
        print '\n=> output:'
        for x in outputTemp:
            print x[0]
            output.append(x[0])
        tStop = time.time()
        '''
            record stop time
        '''
        print '\n=> execution time: ',(tStop - tStart), 'secs'
    else:
        output = 'The article is too short for me to extract concepts from.'
        print output
        output = []

    return output
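The heart of step 3 above is the Leacock-Chodorow comparison between candidate keywords. Here is a minimal standalone sketch of just that test, using only NLTK's WordNet interface; the similarity value depends on the installed WordNet data, so treat the behaviour as illustrative rather than guaranteed.

# Isolated sketch of the step-3 similarity test from parser() above.
from nltk.corpus import wordnet as wn

threshold = 1.4                          # same starting threshold as in parser()
wn_dog = wn.synset('dog.n.01')
wn_cat = wn.synset('cat.n.01')
score = wn_dog.lch_similarity(wn_cat)    # -log(p/2d): p = shortest path, d = taxonomy depth
if score > threshold:
    print('merge the "dog" and "cat" keywords into one category (%.2f)' % score)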
Example #6
    # load a brill tagger trained by nltk-trainer
    # https://github.com/japerk/nltk-trainer
    #tagger = pickle.load("/Users/nathan/nltk_data/taggers/treebank_brill_aubt.pickle")
    # apply part of speech tags to the tokens
    pos_tokens = nltk.pos_tag(tokens)

    # for rhyming poetry, split final word from line:
    front_tokens, end_tokens = split_final_word_from_line(pos_tokens)

    #print "labeled tokens..."

    #replace all adjectives with a synonym
    #adj_replaced_tokens = [replace_adjectives_strip_pos(x) for x in pos_tokens]
    noun_replaced_tokens = [replace_nouns_strip_pos(x) for x in front_tokens]
    #print "replaced nouns..."
    sys.stdout.flush()

    # Untokenize the text to create a single string and clean up some of the dashes, which confuse the reporting script
    en_nl = get_nl('en')
    #replaced_text = en_nl.untokenize(" ".join(adj_replaced_tokens))#.replace(".",".\n")
    replaced_text = en_nl.untokenize(" ".join(noun_replaced_tokens +
                                              [x[0] for x in end_tokens]))
    #print sys.stdout.write(".")
    sys.stdout.flush()

    # write modified literature to file
    f.write("<pre>\n")
    f.write(replaced_text + "\n")
    f.write("</pre>\n")
f.close()
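This snippet relies on simplenlp's untokenize to turn a space-joined token list back into readable text. A minimal tokenize/untokenize round trip, sketched under the assumption that the exact spacing of the output may vary with the simplenlp version:

from simplenlp import get_nl

en_nl = get_nl('en')
tokenized = en_nl.tokenize("Don't worry, be happy.")   # separates punctuation, splits contractions
print(tokenized)
print(en_nl.untokenize(tokenized))                     # reverses the tokenization into plain text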
Example #7
def test_english():
    english = luminoso2.make_english(TEMPDIR+'/english')
    assert english.assoc.entry_named('cat', 'dog') > 0.5
    err1 = english.learn_assoc(1, 'foo', 'bar')
    assoc1 = english.assoc.entry_named('foo', 'bar')
    err2 = english.learn_assoc(1, 'foo', 'bar')
    assoc2 = english.assoc.entry_named('foo', 'bar')
    # after seeing the same example twice, error should decrease
    assert err2 < err1
    # after seeing the same example twice, association should increase
    assert assoc2 > assoc1

if __name__ == '__main__':
    import cProfile
    import simplenlp
    en = simplenlp.get_nl('en')
    en.lemma_split('test')
    en.is_stopword('test')
    setup_module()
    model = LuminosoModel.make_empty(
        TEMPDIR + '/testdocs',
        {
            'num_concepts': 5,
            'num_axes': 2,
            'iteration': 0,
            'reader': 'simplenlp.en'
        }
    )
    cProfile.run('for i in xrange(10): model.learn_from_url("TestDocuments")', sort=2)
    #model = LuminosoModel('../models/PLDBStudy_test3')
    #cProfile.run("model.learn_from_url('../models/PLDBStudy/Documents')", sort=2)
Example #8
#! /usr/bin/env python

import MicrosoftNgram
#from conceptnet.models import *
from simplenlp import get_nl

en_nl = get_nl('en')


s = MicrosoftNgram.LookupService(model='urn:ngram:bing-body:apr10:5')


sent = "I have had a pretty crazy weekend"
ans = en_nl.lemma_split(sent)


print ans
Example #9
def doctest_globals():
    en_nl = get_nl('en')
    return locals()
Example #10
import nltk
import re


from simplenlp import get_nl
en = get_nl('en')

IN = re.compile(r'.*\bin\b')
term = ""

class doc():
    pass

doc.headline = ['not applicable']

source_text = """
Charles is living in North Dakota. Hubert is visiting in New York. 
George was born in Great Britain. Hubert is visiting in Mexico City. I like cheese.
"""
question = "Where was George Washington born?"

taggedtokens = nltk.pos_tag(nltk.word_tokenize(question))
taggedtokens = nltk.pos_tag(nltk.word_tokenize(source_text))

def process_source(source_text):
    tokens = []
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(source_text))):
        print chunk
        if hasattr(chunk, 'node'):
            if chunk.node != 'GPE':
                tmp_tree = nltk.Tree(chunk.node, [(' '.join(c[0] for c in chunk.leaves()))])
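process_source is cut off in the source. As a hedged sketch of the GPE (location) extraction it appears to be working toward, here is a self-contained variant built only on standard NLTK calls; note that the snippet above uses the older chunk.node attribute, while NLTK 3.x spells it chunk.label().

import nltk

def extract_gpe_names(text):
    # Collect named entities labeled GPE (geo-political entities).
    # A sketch of the apparent intent, not the original function's full logic.
    names = []
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
        if hasattr(chunk, 'label') and chunk.label() == 'GPE':
            names.append(' '.join(leaf[0] for leaf in chunk.leaves()))
    return names

# e.g. extract_gpe_names(source_text) should include 'North Dakota', 'New York'
# and 'Great Britain' (exact results depend on the NLTK NE chunker and its models).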