def __init__(self):
        # get the original sparse matrix
        self.__conceptnet = divisi2.load("/opt/work/emotion_analysis/data_source/conceptnet_en.pickle")

        # Get the matrix after svd
        self.__concept_axes, self.__axis_weights, self.__feature_axes = self.__conceptnet.svd(k=100)

        # Get the similarity operator
        self.__sim = divisi2.reconstruct_similarity(self.__concept_axes, self.__axis_weights, post_normalize=True)
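A minimal usage sketch for the operator built above; `similarity` is a hypothetical helper method (not part of the snippet), and it relies only on entry_named, the accessor used elsewhere on this page, which raises KeyError for unknown concept labels:

def similarity(self, concept1, concept2):
        # hypothetical helper, assumed to live in the same class as __init__ above
        try:
            return self.__sim.entry_named(concept1, concept2)
        except KeyError:
            # one of the concepts is not in the matrix
            return 0.0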
Example #3
	def __init__(self, matrix_path=data_path+'feature_matrix_zh.smat'):
		# AnalogySpace
		A = divisi2.load(matrix_path)
		self.A = A.normalize_all()
		self.concept_axes, axis_weights, self.feature_axes = self.A.svd(k=100)
		self.sim = divisi2.reconstruct_similarity(\
				self.concept_axes, axis_weights, post_normalize=False)
		self.predict = divisi2.reconstruct(\
				self.concept_axes, axis_weights, self.feature_axes)
		# Fast spreading activation
		assoc = divisi2.load(data_path+'assoc_matrix_zh.smat')
		self.assoc = assoc.normalize_all()
		U, S, _ = self.assoc.svd(k=100)
		self.spread = divisi2.reconstruct_activation(U, S)
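A hedged sketch of how the three operators built above might be queried, written as hypothetical helper methods for the (unnamed) class; it assumes the spreading-activation operator supports the same row_named/top_items accessors that the similarity operators on this page use, and the concept argument is a placeholder for whatever label exists in the Chinese matrix:

def most_similar(self, concept, n=10):
    # hypothetical helper for the class above: concepts most similar to `concept`
    return self.sim.row_named(concept).top_items(n=n)

def spread_neighbours(self, concept, n=10):
    # hypothetical helper: neighbours under the fast spreading-activation operator
    return self.spread.row_named(concept).top_items(n=n)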
Example #4
 def create(self, U, S, post_normalize=False):
     self._matrix = reconstruct_similarity(U, S, post_normalize=post_normalize)
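A hedged sketch of how create might be fed, assuming U and S come from an SVD of the normalized English ConceptNet matrix as in the surrounding examples; `builder` is a hypothetical instance of whatever class defines create:

import divisi2

A = divisi2.network.conceptnet_matrix('en')
U, S, _V = A.normalize_all().svd(k=100)
builder.create(U, S, post_normalize=True)   # `builder` is a hypothetical instance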
Example #5
from csc.nl import get_nl
import itertools as it
import divisi2

en_nl = get_nl('en')

A = divisi2.network.conceptnet_matrix('en')
concept_axes, axis_weights, feature_axes = A.normalize_all().svd(k=100)
sim = divisi2.reconstruct_similarity(concept_axes, axis_weights, post_normalize=True)

cheese_text = "Cheese is a type of food. It is made from milk. There are many types of cheese. Many things affect the style, texture and flavor of a cheese. These include the origin of the milk, if the milk has been pasteurized, the amount of butterfat, bacteria and mold in the cheese, how the cheese is made and how old the cheese is. For some cheeses, the milk is curdled by adding acids such as vinegar or lemon juice. Most cheeses are acidified by bacteria. This bacteria turns milk sugars into lactic acid. Rennet is then used to finish the curdling. Vegetarian alternatives to rennet can also be used. Most of these are made by fermentation of a fungus called Mucor miehei. Other alternatives us species of the Cynara thistle family. People have been making cheese since before history was written down. It is not known when cheese was first made. It is known that cheese was eaten by the Sumerians in about 4000 BC. Cheese is usually made using milk. The milk of cows, goats, and sheep are most popular. Buffalo, camel and even mare's milk can also be used. Cheese makers usually cook the milk in large pots. They add salt and a substance from the stomach of young cows called rennet. This curdles the cheese and makes it solid. Some makers do not add rennet. They curdle the cheese in other ways. Cheese made in factories is often curdled by using bacteria. Other ingredients are added and the cheese is usually aged for a short time."
cheese_text_list = cheese_text.split('.')


def extract_concepts(sentence):
    return en_nl.extract_concepts(sentence, max_words=1, check_conceptnet=True)


def find_sim_words(word1, word2):
    try:
        similarity = sim.entry_named(word1, word2)
        return similarity
    except KeyError, err:
        print "Key not found: {0}".format(str(err))


def sentence_sim(concepts):
    pairs = list(it.product(*concepts))
    similarity = 0
    for pair in pairs:
        try:
            # accumulate similarity; skip concept pairs missing from the matrix
            similarity += sim.entry_named(pair[0], pair[1])
        except KeyError:
            pass
    return similarity
Example #6
def parser(txt):
    # append the extracted keywords to "output" and return it
    # (the step-by-step print statements below are debug output)
    output = []
    en_nl = get_nl('en')            # specify language
    #en = Language.get('en')         # specify language

    # load articles from file
    #if os.name == 'nt':
    #    openfile = open('./in.txt', 'r')
    #if os.name == 'posix':
    #    import sys
    #    sys.path.append('/home/chsiensu/.conceptnet/nltk-2.0.1rc1')
    #    openfile = open('/home/chsiensu/.conceptnet/intext.txt', 'r')
    
    #raw = openfile.read()
    
    # input text from the web-page
    raw = txt

    '''
        raw: the original, unprocessed blog paragraph text
    '''
    tStart = time.time()
    '''
        record start time
    '''
    articleLengthCheck = 1
    print '\n===>step 1: extract_concepts'
    bigram = []
    concepts = en_nl.extract_concepts(raw, max_words=2, check_conceptnet=True)
    '''
        extract_concepts:
        Extract a list of the concepts that are directly present in text.
        max_words specifies the maximum number of words in the concept.
        If check_conceptnet is True, only concepts that are in ConceptNet for this
        language will be returned.
    '''
    if len(concepts) < 20:
        articleLengthCheck = 0
    if articleLengthCheck:
        print '=> concepts:'
        for x in concepts:
            print x
            if len(x.split()) == 2:
                bigram.append(x.split()[0]+ '_'+ x.split()[1])
                '''
                    Reform "ice cream" into "ice_cream" and push "ice_cream" onto bigram
                '''
        print '=> size(concepts):',len(concepts)
        print '\n=> bigram:'
        for x in bigram:
            print x
        print '=> size(bigram):',len(bigram)
    
        print '\n===>step 2: get Part-of-Speech(POS) tags'
        remainTags = ['NN','NNP','NNS']
        '''
            remainTags:
            Keep only tokens whose tag appears in
            ['NN','NNP','NNS'] (noun tags);
            see the Brown Corpus tag set:
            http://en.wikipedia.org/wiki/Brown_Corpus
            
            original version of remainTags:
            remainTags = ['FW','JJ','JJR','JJT','NN','NN$','NNP','NNS','NP','NP$',
            'NPS','NPS$','NR','RB','RBR','RBT']
        '''
        raw2 = en_nl.tokenize(raw)
        '''
            en_nl.tokenize(raw):
            Inserts spaces in such a way that it separates
            punctuation from words, splits up contractions
        '''
    
        tokenizedRaw = nltk.word_tokenize(raw2)
        '''
            word_tokenize:
            Tokenizers divide strings into lists of substrings;
            word_tokenize divides a string into a list of words
        '''
    
        posTag = nltk.pos_tag(tokenizedRaw)
        '''
            nltk.pos_tag:
            Use NLTK's currently recommended part of speech tagger to
            tag the given list of tokens.
        '''
    
        tags = []
        count = 0
        tagDepth = math.floor(math.log(len(tokenizedRaw))+2)
        #tagDepth = 8
        print '=> (token, normalized token, tag):'
        for tag in posTag:
            '''
                posTag:
                (friends, NNS)
                (Parking, NNP)
                ...
            '''
            if tag[1] in remainTags and len(tag[0]) > 2:
                try:
                    #wnTag = wn.synset(tag[0]+'.n.01')
                    wnTag = wn.synset(en_nl.word_split(tag[0])[0].lower()+'.n.01')
                    if len(wnTag.hypernym_distances()) > tagDepth:
                        count += 1
                        stemmedTag = en_nl.word_split(tag[0])
                        print tag[0], stemmedTag[0].lower(), tag[1], len(wnTag.hypernym_distances())
                        tags.append(stemmedTag[0].lower())
                        '''
                            stemmedTag:
                            normalized tokens, for example,
                            friends -> friend
                            Parking -> park
                        '''
                except:
                    pass
        print '=> size((token, normalized token, tag)):', count
    
        print '\n===>step 3: intersection of ( POS tag && extract_concepts )'
        '''
            In step 3,
            1) keywords = intersection of sets from (Part-of-Speech tags) and
               (extract_concepts)
            2) Classify these keywords into categories with desired distribution 
               (the largest category should not contain almost all the
               keywords)
        '''
        intersectTags = [x for x in tags if x in concepts]
        for x in bigram:
            try:
                wn.synset(x+'.n.01')
                intersectTags.append(x)
                '''
                    append bigrams on intersectTags
                '''
            except:
                pass
        print '=> intersectTags:'
        for x in intersectTags:
            print x
        print '=> size(intersectTags):', len(intersectTags)
        intersectTagsCopy = intersectTags
        intersectTags = list(set(intersectTags))
        category = []
        for x in intersectTags:
            category.append([[x] * intersectTagsCopy.count(x)])
        i = 0
        for x in intersectTags:
            category[i] = category[i][0]
            i += 1
        '''
            category:
            Keyword lists in which each keyword's occurrence count is kept:
            [['dog', 'dog', 'dog'],
             ['cat', 'cat'],
             ...
            ]
            
            intersectTags:
            The deduplicated keyword list; occurrence counts are NOT kept:
            ['dog',
             'cat',
             ...
            ]
        '''
    
        iteration = 1
        threshold = 1.4
        categoryRatio = 1.0
        categoryCopy = copy.deepcopy(category)
        '''
            threshold:
            We start the threshold at 1.4 (chosen through trial and error) on the
            Leacock-Chodorow similarity; keyword pairs whose similarity is below
            1.4 are discarded. If the threshold turns out too low to classify the
            keywords appropriately, it is increased by 0.1 at the next iteration.
            
            categoryRatio:
            After categorizing the keywords into n separate categories c(1), c(2), ...
            c(n), we compute the ratio of the largest category as c(1) /
            ( c(1) + c(2) + c(3) ), where c(1) is the largest category, c(2)
            is the 2nd largest category and c(3) is the 3rd largest category.
    
            If the ratio is above 0.8, there are too many keywords in c(1),
            so we reduce the keywords in c(1) and move more keywords into
            c(2) and c(3) (by increasing the threshold by 0.1) to make the
            top 3 categories more evenly distributed.
    
            categoryCopy:
            For restoring the category at the next iteration.
            
        '''
        outerCount = 0
        innerCount = 0
        tagSimilarity = []
        for tag1 in intersectTags:
            outerCount +=1
            for tag2 in intersectTags[outerCount:]:
                try:
                    '''
                        Why use try?
                        Some words (e.g. adjectives, adverbs) are incorrectly tagged as
                        nouns and cause an error here: wn.synset(tag1+'.n.01') and
                        wn.synset(tag2+'.n.01') only work for nouns.
                    '''
                    wnTag1 = wn.synset(tag1+'.n.01')
                    wnTag2 = wn.synset(tag2+'.n.01')
                    if wnTag1.lch_similarity(wnTag2) > threshold:
                        tagSimilarity.append([wnTag1.lch_similarity(wnTag2), tag1, tag2])
                        '''
                            lch_similarity:
                            Leacock-Chodorow similarity returns a score denoting how similar
                            two word senses are, based on the shortest path that connects
                            the senses and the maximum depth of the taxonomy in which the
                            senses occur. The relationship is given as -log(p/2d), where p
                            is the shortest path length and d the taxonomy depth.
                        '''
                        innerCount +=1
                except:
                    pass
        while (categoryRatio > 0.8):
            category = copy.deepcopy(categoryCopy)
            tagSimilarity = [x for x in tagSimilarity if x[0] > threshold]
            sortedTagSimilarity = sorted(tagSimilarity, key=lambda tag: tag[0], reverse=True)
            print '\n=> sortedTagSimilarity:'
            for s in sortedTagSimilarity:
                '''
                    sortedTagSimilarity:
                    (           s[0]             ,s[1], s[2])
                    (similarity of tag1 and tag4 ,tag1, tag4) ## largest similarity
                    (similarity of tag3 and tag5 ,tag3, tag5) ## 2nd largest similarity 
                    ...
    
                    In this FOR loop, we:
                    1) Pop the list that contains s[1] from 'category'
                    2) Pop the list that contains s[2] from 'category'
                    3) Merge the lists from 1) and 2) into a bigger list whose
                       size is the sum of the two
                    4) Push the merged list back onto 'category'
                '''
                count = 0
                list1 = []
                for x in category:
                    if s[1] in x:
                        list1 = category.pop(count)
                        break
                    count += 1
                count = 0
                list2 = []
                for x in category:
                    if s[2] in x:
                        list2 = category.pop(count)
                        break
                    count += 1
                for x in list2:
                    list1.append(x)
                category.append(list1)
                print s
            print '=> size(sortedTagSimilarity):', len(sortedTagSimilarity)
            sortedCategory = []
            for a in category:
                sortedCategory.append([len(a),a])
            sortedCategory = sorted(sortedCategory, key=lambda tag: tag[0], reverse=True)
            categorySum = sortedCategory[0][0] + sortedCategory[1][0] + sortedCategory[2][0]
            categoryRatio = float(sortedCategory[0][0]) / categorySum
    
            print '\n=> category:'
            for x in category:
                print x
            print '=> number of category           : ', len(category)
            print '=> threshold                    : ', threshold
            print '=> size of largest category     : ', sortedCategory[0][0]
            print '=> size of 2nd largest category : ', sortedCategory[1][0]
            print '=> size of 3rd largest category : ', sortedCategory[2][0]
            print '=> categoryRatio                : ', categoryRatio
            print '=> End of iteration             : ', iteration
            print '=> ' * 10
            iteration += 1
            threshold += 0.1
    
    
        print '\n===>step 4: category prediction'
        '''
            Find similar concepts for the 3 largest categories:
            *sortedCategory[0][1] (at most 4 concepts)
            *sortedCategory[1][1] (at most 4 concepts)
            *sortedCategory[2][1] (at most 2 concepts)
    
            Uniformity is also taken into account. For example, if one category is
            ['dog', 'dog', 'dog', 'dog', 'cat', 'cat', 'cat', 'cat'],
            then at most 2 concepts will be extracted from this category,
            even though it has 8 elements.
            
        '''
        category0 = divisi2.category(*sortedCategory[0][1])
        category1 = divisi2.category(*sortedCategory[1][1])
        category2 = divisi2.category(*sortedCategory[2][1])
        cnet= divisi2.network.conceptnet_matrix('en')
        '''
            reconstruct similarity matrix U*(Sigma^2)*(U^T)
        '''
        concept_axes, axis_weights, feature_axes= cnet.svd(k=100)
        sim = divisi2.reconstruct_similarity(concept_axes, axis_weights, post_normalize=True)
        category0_top4 = sim.left_category(category0).top_items(n=4)
        category1_top4 = sim.left_category(category1).top_items(n=4)
        category2_top2 = sim.left_category(category2).top_items(n=2)
        outputTemp = []
        uniformity0 = len(set(sortedCategory[0][1]))
        uniformity1 = len(set(sortedCategory[1][1]))
        uniformity2 = len(set(sortedCategory[2][1]))
        print '=> category0:'
        for x in category0_top4[ : min(uniformity0 , 4)]:
            outputTemp.append(x)
            print x
        print '=> category1:'
        for x in category1_top4[ : min(uniformity1 , 4)]:
            outputTemp.append(x)
            print x
        print '=> category2:'
        for x in category2_top2[ : min(uniformity2 , 2)]:
            outputTemp.append(x)
            print x
    
        print '\n===>step 5: build output and calculate execution time'
        '''
            output = ['keyword1','keyword2',...]
        '''
        print '=> statistics     :'
        print '=> words count    : ', len(tokenizedRaw)
        print '=> # of concepts  : ', len(concepts)
        print '=> # of tags      : ', len(tags)
        print '=> # of category  : ', len(category)
        output = []
        print '\n=> output:'
        for x in outputTemp:
            print x[0]
            output.append(x[0])
        tStop = time.time()
        '''
            record stop time
        '''
        print '\n=> execution time: ',(tStop - tStart), 'secs'
    else:
        output = 'The article is too short for me to extract concepts'
        print output
        output = []

    return output
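A minimal driver sketch for parser, assuming the module-level imports it relies on (get_nl, nltk, wn as the WordNet corpus reader, divisi2, math, copy, time) are already in place; the input file name is hypothetical:

if __name__ == '__main__':
    article = open('sample_article.txt').read()   # hypothetical input file
    keywords = parser(article)
    print '\nextracted keywords:', keywords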
Example #7
def compute_sim_conceptnet(argv) : #word_dict, cn_words):
    if len(argv)<2:
        argv = ['-h']
    parser = OptionParser(description='construct+compare conceptnet and flickr word similarities')
    parser.add_option('-d', '--db_dir', dest='db_dir', default="", help='dir containing sqlite db files')
    parser.add_option("", '--db_dict', dest='db_dict', default="dict.db", help='dictionary')
    parser.add_option("", '--addl_vocab', dest='addl_vocab', default="places_etc.txt", help='')
    parser.add_option("", '--svd_dim', dest='svd_dim', type='int', default=150, help='')
    parser.add_option("-K", '--num_pairs', dest='num_pairs', type='int', default=1e6, help='')
    #parser.add_option("", '--bigram_file', dest='bigram_file', default="bigram_filtered.txt", help='')
    (opts, __args) = parser.parse_args(sys.argv)

    # intersect the two dictionaries first
    db_dict = os.path.join(opts.db_dir, opts.db_dict)
    addl_vocab = os.path.join(opts.db_dir, opts.addl_vocab)
        
    cn_words, cn_vocab = get_conceptnet_words(db_dict, addl_vocab)
    # build inverted index (words --> CN terms)
    cn_ii = dict.fromkeys(cn_vocab) 
    for w in cn_words.keys():
        for v in cn_words[w]:
            if not cn_ii[v]:
                cn_ii[v] = [w]
            else:
                cn_ii[v].append(w)
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s finished compiling inverted index for %d words from %d concept-net terms" % (tt, len(cn_words), len(cn_vocab))
    pickle.dump((cn_ii, cn_words), open(os.path.join(opts.db_dir, 'vocab_conceptnet_invertidx.pkl'), 'wb') )
    
    #print "done"
    #return

    # read conceptnet, write the top K similar pairs (by SVD) to file
    A = divisi2.network.conceptnet_matrix('en')
    B = A.normalize_all()
    U, S, _V = B.svd(k = opts.svd_dim)
    sim = divisi2.reconstruct_similarity(U, S, post_normalize=False).to_dense()
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s finished %dD-SVD for %s dimensions" % (tt, opts.svd_dim, repr(sim.shape))

    sim_out = open(os.path.join(opts.db_dir, 'conceptnet_sim_out.txt'), 'wt')

    nv = len(cn_vocab)
    ne = int(.5*nv*(nv-1))
    #vsim = [-.5]*ne #np.empty([.5*nv*(nv-1), 1], dtype=float)
    #vidx = ne*[[1, 2]]
    vcnt = 0
    for i in range(nv):
        for j in range(i):
            #vidx = [i, j]
            vsim = 0

            for wi in cn_ii[cn_vocab[i]]:
                for wj in cn_ii[cn_vocab[j]]:
                    if wi == wj:
                        continue
                    ii = sim.col_index(wi) # index of CN terms
                    jj = sim.col_index(wj)
                    vsim += sim[ii, jj]
            ww = sorted([cn_vocab[i], cn_vocab[j]])
            if vsim > 0:
                sim_out.write("%0.8f\t%s\t%s\n" % (vsim, ww[0], ww[1]) )
            vcnt += 1
            if vcnt % 10000==0:
                tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
                print "%s %d / %d similarities computed" % (tt, vcnt, ne)
    
    tt = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
    print "%s Done.  %d / %d similarities computed" % (tt, vcnt, ne)
    sim_out.close()
    return   
         
    """
Example #8
 def __init__(self, num_axes):
     A = divisi2.network.conceptnet_matrix('en')
     concept_axes, axis_weights, feature_axes = A.svd(k=num_axes)
     self.sim = divisi2.reconstruct_similarity(concept_axes, axis_weights, post_normalize=True)
     self.concept_axes = concept_axes
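A hedged sketch of querying the stored operator with the category helpers used in the parser example above; `space` is assumed to be an instance of this class, and the concept names are placeholders:

cat = divisi2.category('dog', 'cat', 'hamster')      # placeholder concepts
print space.sim.left_category(cat).top_items(n=5)    # concepts similar to the category as a whole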
Example #12
import sys
import numpy as np
import divisi2

# first pass over a whitespace-separated "band fan" edge list: collect the
# set of band names (reading from sys.argv[1] is an assumption; the snippet
# starts mid-script and does not show how the file was opened)
file = open(sys.argv[1])
thebands = set()
for line in file:
    band, fan = line.strip().split()
    thebands.add(band)

NBANDS = len(thebands)
NBITS = 12
MODULO = 1<<NBITS
print NBANDS

matrix = divisi2.DenseMatrix(
    np.zeros((NBANDS, MODULO)),
    row_labels = thebands
)

file.seek(0)
counter = 0
for line in file:
    band, fan = line.strip().split()
    row = matrix.row_labels.index(band)
    col = hash(fan) % MODULO
    matrix[row,col] += 1
    counter += 1
    if counter % 1000 == 0:
        print >> sys.stderr, counter
file.close()

U, S, V = matrix.normalize_rows(offset=0.01).svd(k=20)
similar_bands = divisi2.reconstruct_similarity(U, S)
for band in thebands:
    similar = similar_bands.row_named(band).top_items(10)
    print "%s\t%s" % (band, similar)