Exemplo n.º 1
0
def classify(line):
    """Label a tweet '1' or '0' from bigram/unigram count tables.

    The text is lower-cased, cleaned with the module-level patterns and split
    on whitespace.  Every pair of adjacent tokens is filtered, joined with a
    space and passed through ``neg.replace_negations``.  Counts found for the
    resulting bigram (or, failing that, its unigrams) are accumulated in log
    space: positive-table counts feed the numerator, positive and negative
    counts feed the denominator.

    Prints ``exp(score)`` followed by ``line``.

    Returns the string '1' when ``exp(score)`` is greater than
    ``limits.ClassifierThreshold``, otherwise the string '0'.
    """

    def positive_term(count):
        # Log of a positive-table count divided by pos_dic_len / total_len.
        return log(float(count) / (pos_dic_len / total_len))

    # A repeated word character that ends a word is reduced to one
    # occurrence (presumably `sub` is re.sub -- confirm), then the
    # module-level normalise and tweet-filter patterns are applied.
    squeezed = sub(r'(\w)\1+\b', r'\1', line.lower())
    cleaned = tweet_filter.sub("", word_normalize.sub(r'\1\1', squeezed))
    tokens = list(cleaned.split())

    numerator = 0.0
    denominator = 0.0

    # Adjacent token pairs; a tweet with fewer than two tokens has none.
    for position in range(1, len(tokens)):
        pair = filter_tweet(tokens[position - 1]) + " " + filter_tweet(
            tokens[position])
        pair = neg.replace_negations(pair)  # antonym replacer

        count = 0.0

        if len(pair) <= 1:
            # Degenerate pair: look it up directly in the unigram tables and
            # add the combined count to the denominator once.
            if pair in posunigramdic:
                count = float(posunigramdic[pair])
                numerator += positive_term(count)
            if pair in negunigramdic:
                count += float(negunigramdic[pair])
            if count > 0:
                denominator += log(count)
            continue

        parts = pair.split()

        # Positive side: prefer the bigram table, else fall back to unigrams.
        # NOTE(review): positive_term() runs before the `count > 0` check, so
        # a zero count reaches log() unguarded -- confirm counts are positive.
        if pair in posBigramdic:
            count = float(posBigramdic[pair])
            numerator += positive_term(count)
            if count > 0:
                denominator += log(count)
        else:
            for part in parts:
                if part not in posunigramdic:
                    continue
                count = float(posunigramdic[part])
                numerator += positive_term(count)
                if count > 0:
                    denominator += log(count)

        # Negative side.  `count` still holds whatever the positive side left
        # in it, so the bigram case adds log(positive + negative).
        # NOTE(review): the denominator may therefore be incremented twice
        # for one pair -- confirm this is intended.
        if pair in negBigramdic:
            count += float(negBigramdic[pair])
            if count > 0:
                denominator += log(count)
        else:
            for part in parts:
                if part not in negunigramdic:
                    continue
                # Overwrites (does not add to) the running count.
                count = float(negunigramdic[part])
                if count > 0:
                    denominator += log(count)

    prior = log(0.5)
    score = exp((numerator + prior) - denominator)

    # Single-argument parenthesised form: prints the same text as
    # `print score, line` under the Python 2 print statement.
    print("%s %s" % (score, line))

    if score > limits.ClassifierThreshold:
        return '1'
    return '0'
Exemplo n.º 2
0
def classify(line):
    """Label a tweet '1' or '0' from bigram/unigram count tables.

    The text is lower-cased, cleaned with the module-level patterns and split
    on whitespace.  Every pair of adjacent tokens is filtered, joined with a
    space and passed through ``neg.replace_negations``.  Counts found for the
    resulting bigram (or, failing that, its unigrams) are accumulated in log
    space: positive-table counts feed the numerator, positive and negative
    counts feed the denominator.

    Prints ``exp(score)`` followed by ``line``.

    Returns the string '1' when ``exp(score)`` is greater than
    ``limits.ClassifierThreshold``, otherwise the string '0'.
    """

    def positive_term(count):
        # Log of a positive-table count divided by pos_dic_len / total_len.
        return log(float(count) / (pos_dic_len / total_len))

    # A repeated word character that ends a word is reduced to one
    # occurrence (presumably `sub` is re.sub -- confirm), then the
    # module-level normalise and tweet-filter patterns are applied.
    squeezed = sub(r'(\w)\1+\b', r'\1', line.lower())
    cleaned = tweet_filter.sub("", word_normalize.sub(r'\1\1', squeezed))
    tokens = list(cleaned.split())

    numerator = 0.0
    denominator = 0.0

    # Adjacent token pairs; a tweet with fewer than two tokens has none.
    for position in range(1, len(tokens)):
        pair = filter_tweet(tokens[position - 1]) + " " + filter_tweet(
            tokens[position])
        pair = neg.replace_negations(pair)  # antonym replacer

        count = 0.0

        if len(pair) <= 1:
            # Degenerate pair: look it up directly in the unigram tables and
            # add the combined count to the denominator once.
            if pair in posunigramdic:
                count = float(posunigramdic[pair])
                numerator += positive_term(count)
            if pair in negunigramdic:
                count += float(negunigramdic[pair])
            if count > 0:
                denominator += log(count)
            continue

        parts = pair.split()

        # Positive side: prefer the bigram table, else fall back to unigrams.
        # NOTE(review): positive_term() runs before the `count > 0` check, so
        # a zero count reaches log() unguarded -- confirm counts are positive.
        if pair in posBigramdic:
            count = float(posBigramdic[pair])
            numerator += positive_term(count)
            if count > 0:
                denominator += log(count)
        else:
            for part in parts:
                if part not in posunigramdic:
                    continue
                count = float(posunigramdic[part])
                numerator += positive_term(count)
                if count > 0:
                    denominator += log(count)

        # Negative side.  `count` still holds whatever the positive side left
        # in it, so the bigram case adds log(positive + negative).
        # NOTE(review): the denominator may therefore be incremented twice
        # for one pair -- confirm this is intended.
        if pair in negBigramdic:
            count += float(negBigramdic[pair])
            if count > 0:
                denominator += log(count)
        else:
            for part in parts:
                if part not in negunigramdic:
                    continue
                # Overwrites (does not add to) the running count.
                count = float(negunigramdic[part])
                if count > 0:
                    denominator += log(count)

    prior = log(0.5)
    score = exp((numerator + prior) - denominator)

    # Single-argument parenthesised form: prints the same text as
    # `print score, line` under the Python 2 print statement.
    print("%s %s" % (score, line))

    if score > limits.ClassifierThreshold:
        return '1'
    return '0'
Exemplo n.º 3
0
def classify(line, Company):
    """Classify a tweet about one company as positive (2) or not (0).

    Args:
        line: Raw tweet text.
        Company: Key selecting the per-company count tables; one of
            "aapl", "bac" or "goog".

    Returns:
        2 when ``exp(positive score) - exp(negative score)`` is at least
        0.1, otherwise 0.

    Raises:
        ValueError: If ``Company`` is not one of the supported keys.

    Side effect: prints ``Company``, the score difference and ``line``.
    """
    # Select the count tables first so an unsupported company fails
    # immediately with a clear message instead of leaving posd/negd unbound.
    if Company == "aapl":
        posd, negd = aapl_posd, aapl_negd
    elif Company == "bac":
        posd, negd = bac_posd, bac_negd
    elif Company == "goog":
        posd, negd = goog_posd, goog_negd
    else:
        raise ValueError(
            "unsupported Company %r; expected 'aapl', 'bac' or 'goog'"
            % (Company,))

    log_sum_pos = 0.0
    log_sum_neg = 0.0
    log_deno = 0.0

    # A repeated word character that ends a word is reduced to one
    # occurrence, then the module-level normalise and tweet-filter patterns
    # are applied.  (twitter hack!)
    squeezed = sub(r'(\w)\1+\b', r'\1', line.lower())
    tweet = tweet_filter.sub("", word_normalize.sub(r'\1\1', squeezed)).split()

    # One stemmer instance serves every token of the tweet.
    stemmer = stem.PorterStemmer()
    words = []
    for feature in tweet:
        # Spelling table lookup, then stemming, then synonym table lookup
        # (the latter only for stems that are not in `stop`).
        if feature in dicSpell:
            feature = dicSpell[feature]
        word = stemmer.stem(feature)
        if word not in stop and word in dicSyn:
            word = dicSyn[word]
        words.append(word)

    # Each word is prefixed with a space, so replace_negations receives the
    # same leading-space text that repeated `lne + " " + word` produced.
    lne = neg.replace_negations("".join(" " + w for w in words))

    # Each distinct word contributes at most once.
    for word in set(lne.split()):
        temp = 0.0
        if word in posd:
            temp = float(posd[word])
            log_sum_pos += log(temp)
        if word in negd:
            temp += float(negd[word])
            # NOTE(review): this adds log(positive + negative count), not
            # log(negative count) alone -- confirm that is intended.
            log_sum_neg += log(temp)
        if temp > 0:
            log_deno += log(temp)
            # Fixed per-word adjustment to the positive score.
            log_sum_pos += log(2.1)
            # log(1) is 0, so this leaves log_sum_neg unchanged; presumably a
            # placeholder for a negative-side adjustment -- confirm.
            log_sum_neg += log(1)

    log_pos_prob = log(0.5)
    log_neg_prob = log(0.5)

    log_sum = (log_sum_pos + log_pos_prob) - log_deno
    n_log_sum = (log_sum_neg + log_neg_prob) - log_deno

    difference = exp(log_sum) - exp(n_log_sum)

    # Single-argument parenthesised form: prints the same text as
    # `print Company, difference, line` under the Python 2 print statement.
    print("%s %s %s" % (Company, difference, line))

    if difference >= 0.1:
        return 2
    return 0  # BAD NEGATIVE
Exemplo n.º 4
0
def classify(line, Company):
    """Classify a tweet about one company as positive (2) or not (0).

    Args:
        line: Raw tweet text.
        Company: Key selecting the per-company count tables; one of
            "aapl", "bac" or "goog".

    Returns:
        2 when ``exp(positive score) - exp(negative score)`` is at least
        0.1, otherwise 0.

    Raises:
        ValueError: If ``Company`` is not one of the supported keys.

    Side effect: prints ``Company``, the score difference and ``line``.
    """
    # Select the count tables first so an unsupported company fails
    # immediately with a clear message instead of leaving posd/negd unbound.
    if Company == "aapl":
        posd, negd = aapl_posd, aapl_negd
    elif Company == "bac":
        posd, negd = bac_posd, bac_negd
    elif Company == "goog":
        posd, negd = goog_posd, goog_negd
    else:
        raise ValueError(
            "unsupported Company %r; expected 'aapl', 'bac' or 'goog'"
            % (Company,))

    log_sum_pos = 0.0
    log_sum_neg = 0.0
    log_deno = 0.0

    # A repeated word character that ends a word is reduced to one
    # occurrence, then the module-level normalise and tweet-filter patterns
    # are applied.  (twitter hack!)
    squeezed = sub(r'(\w)\1+\b', r'\1', line.lower())
    tweet = tweet_filter.sub("", word_normalize.sub(r'\1\1', squeezed)).split()

    # One stemmer instance serves every token of the tweet.
    stemmer = stem.PorterStemmer()
    words = []
    for feature in tweet:
        # Spelling table lookup, then stemming, then synonym table lookup
        # (the latter only for stems that are not in `stop`).
        if feature in dicSpell:
            feature = dicSpell[feature]
        word = stemmer.stem(feature)
        if word not in stop and word in dicSyn:
            word = dicSyn[word]
        words.append(word)

    # Each word is prefixed with a space, so replace_negations receives the
    # same leading-space text that repeated `lne + " " + word` produced.
    lne = neg.replace_negations("".join(" " + w for w in words))

    # Each distinct word contributes at most once.
    for word in set(lne.split()):
        temp = 0.0
        if word in posd:
            temp = float(posd[word])
            log_sum_pos += log(temp)
        if word in negd:
            temp += float(negd[word])
            # NOTE(review): this adds log(positive + negative count), not
            # log(negative count) alone -- confirm that is intended.
            log_sum_neg += log(temp)
        if temp > 0:
            log_deno += log(temp)
            # Fixed per-word adjustment to the positive score.
            log_sum_pos += log(2.1)
            # log(1) is 0, so this leaves log_sum_neg unchanged; presumably a
            # placeholder for a negative-side adjustment -- confirm.
            log_sum_neg += log(1)

    log_pos_prob = log(0.5)
    log_neg_prob = log(0.5)

    log_sum = (log_sum_pos + log_pos_prob) - log_deno
    n_log_sum = (log_sum_neg + log_neg_prob) - log_deno

    difference = exp(log_sum) - exp(n_log_sum)

    # Single-argument parenthesised form: prints the same text as
    # `print Company, difference, line` under the Python 2 print statement.
    print("%s %s %s" % (Company, difference, line))

    if difference >= 0.1:
        return 2
    return 0  # BAD NEGATIVE