示例#1
0
def main(argv):
    """Train and evaluate every requested classifier configuration.

    ``argv`` holds up to five positional values; each missing one falls
    back to the default shown in brackets:

        argv[0]  file prefix forwarded to trainAndClassify  ['logs/run']
        argv[1]  comma-separated classifier names   ['NaiveBayesClassifier']
        argv[2]  comma-separated method names       ['1step']
        argv[3]  comma-separated n-gram values      [1]
        argv[4]  comma-separated negation flags     [False]

    Names that are not in LIST_CLASSIFIERS / LIST_METHODS and numeric
    entries that are not all digits are silently dropped.  If any option
    ends up empty, the usage text is printed and the function returns
    before any data is loaded.

    Otherwise the tweets read from 'sentiment.csv' and from the Stanford
    corpus '.5000.norm.csv' file are concatenated, shuffled and handed to
    trainAndClassify once per (classifier, method, ngram, negation)
    combination produced by the nested grid() calls.  An exception raised
    for one combination is printed and the remaining combinations still run.

    Returns None.
    """
    __usage__ = '''
    usage: python sentiment.py logs/fileprefix ClassifierName,s methodName,s ngramVal,s negtnVal,s
        ClassifierName,s:   %s
        methodName,s:       %s
        ngramVal,s:         %s
        negtnVal,s:         %s
    ''' % (str(LIST_CLASSIFIERS), str(LIST_METHODS), str([1, 3]), str([0, 1]))
    import sanderstwitter02
    import stanfordcorpus
    import stats

    argc = len(argv)

    fileprefix = str(argv[0]) if argc >= 1 else 'logs/run'

    if argc >= 2:
        classifierNames = [
            name for name in argv[1].split(',') if name in LIST_CLASSIFIERS
        ]
    else:
        classifierNames = ['NaiveBayesClassifier']

    if argc >= 3:
        methodNames = [
            name for name in argv[2].split(',') if name in LIST_METHODS
        ]
    else:
        methodNames = ['1step']

    if argc >= 4:
        ngramVals = [int(val) for val in argv[3].split(',') if val.isdigit()]
    else:
        ngramVals = [1]

    if argc >= 5:
        # Any non-zero digit string counts as "negation on".
        negtnVals = [
            bool(int(val)) for val in argv[4].split(',') if val.isdigit()
        ]
    else:
        negtnVals = [False]

    # An option that was supplied but contained no valid entry is an error.
    if not (fileprefix and classifierNames and methodNames and ngramVals
            and negtnVals):
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(__usage__)
        return

    tweets1 = sanderstwitter02.getTweetsRawData('sentiment.csv')
    tweets2 = stanfordcorpus.getNormalisedTweets('stanfordcorpus/' +
                                                 stanfordcorpus.FULLDATA +
                                                 '.5000.norm.csv')
    #random.shuffle(tweets1)
    #random.shuffle(tweets2)
    tweets = tweets1 + tweets2
    random.shuffle(tweets)
    #tweets = tweets[:100]
    sys.stderr.write('\nlen( tweets ) = ' + str(len(tweets)))

    # Optional diagnostics, currently disabled:
    #sys.stderr.write( '\n' )
    #stats.preprocessingStats( tweets1, fileprefix='logs/stats_'+TIME_STAMP+'/TSC' )
    #sys.stderr.write( '\n' )
    #stats.preprocessingStats( tweets2, fileprefix='')#logs/stats_'+TIME_STAMP+'/STAN' )
    #sys.stderr.write( '\n' )
    #stats.stepStats( tweets , fileprefix='logs/stats_'+TIME_STAMP+'/Both' )

    #generateARFF(tweets, fileprefix)

    #print classifierNames, methodNames, ngramVals, negtnVals
    TIME_STAMP = get_time_stamp()
    # Each grid() item is shaped (((classifier, method), ngram), negation).
    for (((cname, mname), ngramVal),
         negtnVal) in grid(grid(grid(classifierNames, methodNames), ngramVals),
                           negtnVals):
        try:
            trainAndClassify(tweets,
                             classifier=cname,
                             method=mname,
                             feature_set={
                                 'ngram': ngramVal,
                                 'negtn': negtnVal
                             },
                             fileprefix=fileprefix + '_' + TIME_STAMP)
        except Exception as e:
            # Best effort: report the failure and move on to the next
            # configuration instead of aborting the whole run.
            print(e)
示例#2
0
def main(argv):
    """Classify election tweets with models trained on the Stanford corpus.

    ``argv`` is a getopt-style long-option list.  Options read here:

        --logstore    file prefix forwarded to trainAndClassify ['logs/run']
        --classifier  comma-separated names, filtered by LIST_CLASSIFIERS
                      ['NaiveBayesClassifier']
        --negate      comma-separated digit flags, non-zero means True
                      [False]
        --output      CSV file holding the election tweets ['output_got.csv']
        --sample      sample size used in the Stanford corpus file names
                      ['1000000']

    The remaining long options (--username, --near, --within, --since,
    --until, --querysearch, --toptweets, --maxtweets, --party) are accepted
    by the parser but not interpreted here; the unchanged ``argv`` is
    forwarded to Exporter.main when the --output file does not exist yet.

    With an empty ``argv`` a hint is printed, with a lone '-h' the usage
    text is printed, and in both cases nothing else happens.  The usage
    text is also printed, before any file is produced, when an option
    leaves one of the configuration lists empty.

    Raises whatever getopt.getopt raises for an unknown option.
    Returns None.
    """
    __usage__='''
    usage: python sentiment.py --logstore logs/fileprefix --classifier ClassifierName,s --negate negtnVal,s --since date(yyyy-mm-aa) --until date(yyyy-mm-aa) --querysearch search_text --near city_of_tweets --maxtweets max_number --output filename_for_csv("Rajasthan_BJP.csv") --party partyname --sample sample_size_for_training
        ClassifierName,s:   %s
        negtnVal,s:         %s
    ''' % ( str( LIST_CLASSIFIERS ), str([0,1]) )
    import stanfordcorpus
    import electiontweets
    import stats
    import Exporter
    if len(argv) == 0:
        print('You must pass some parameters. Use \"-h\" to help.')
        return

    if len(argv) == 1 and argv[0] == '-h':
        print(__usage__)
        return

    # The option list must cover everything Exporter.main understands too,
    # because both parsers receive the very same argv.
    opts, _ = getopt.getopt(argv, "", (
        "logstore=", "classifier=", "negate=", "username=", "near=",
        "within=", "since=", "until=", "querysearch=", "toptweets",
        "maxtweets=", "output=", "party=", "sample="))

    fileprefix = 'logs/run'
    classifierNames = ['NaiveBayesClassifier']
    methodNames = ['1step']
    ngramVals = [1]
    negtnVals = [False]
    electionfile = 'output_got.csv'
    # Kept as a string: it is spliced into file names below and only
    # converted with int() where a number is required.
    sample = '1000000'
    for opt, arg in opts:
        if opt == '--logstore':
            fileprefix = arg
        elif opt == '--classifier':
            classifierNames = [
                name for name in arg.split(',') if name in LIST_CLASSIFIERS
            ]
        elif opt == '--negate':
            negtnVals = [
                bool(int(val)) for val in arg.split(',') if val.isdigit()
            ]
        elif opt == '--output':
            electionfile = str(arg)
        elif opt == '--sample':
            sample = str(arg)

    # Validate before doing any work so that bad arguments do not trigger
    # the export step first.
    if not (fileprefix and classifierNames and methodNames and ngramVals
            and negtnVals):
        print(__usage__)
        return

    if not os.path.isfile(electionfile):
        Exporter.main(argv)

    corpus_stem = 'stanfordcorpus/' + stanfordcorpus.FULLDATA
    sample_csv = corpus_stem + '.' + sample + '.sample.csv'
    norm_csv = corpus_stem + '.' + sample + '.norm.csv'
    if not os.path.isfile(norm_csv):
        # Build the normalised sample file on first use.
        stanfordcorpus.randomSampleCSV(corpus_stem, sample_csv, K=int(sample))
        stanfordcorpus.getNormalisedCSV(sample_csv, norm_csv)

    tweets = stanfordcorpus.getNormalisedTweets(norm_csv)
    tweets3 = electiontweets.getTweetsRawData(electionfile)
    random.shuffle(tweets)
    sys.stderr.write('\nlen( tweets ) = ' + str(len(tweets)))

    #generateARFF(tweets, fileprefix)

    #print classifierNames, methodNames, ngramVals, negtnVals
    TIME_STAMP = get_time_stamp()
    # Each grid() item is shaped (((classifier, method), ngram), negation).
    for (((cname, mname), ngramVal), negtnVal) in grid(
            grid(grid(classifierNames, methodNames), ngramVals), negtnVals):
        try:
            trainAndClassify(
                tweets, tweets3, classifier=cname, method=mname,
                feature_set={'ngram': ngramVal, 'negtn': negtnVal},
                fileprefix=fileprefix + '_' + TIME_STAMP,
                ansfile=electionfile)
        except Exception as e:
            # Best effort: report the failure and continue with the next
            # configuration.
            print(e)
示例#3
0
def main(argv):
    """Run the classifier grid over the Stanford '.100000.norm.csv' tweets.

    Positional values in ``argv`` (each optional, default in brackets):

        argv[0]  file prefix forwarded to trainAndClassify  ['logs/run']
        argv[1]  comma-separated classifier names   ['NaiveBayesClassifier']
        argv[2]  comma-separated method names       ['1step']
        argv[3]  comma-separated n-gram values      [2]
        argv[4]  comma-separated negation flags     [False]

    Example:
        python sentimenttry.py logs/fileprefix NaiveBayesClassifier,MaxentClassifier,DecisionTreeClassifier,SvmClassifier 1step,2step 1,3 0

    The parsed configuration is always printed.  If any option ends up
    empty, the usage text is printed and the function returns.  Otherwise
    the tweets are loaded and shuffled, preprocessing statistics are
    requested from try1.preprocessingStats, and trainAndClassify is called
    once per combination; an exception in one combination is printed and
    the loop continues.

    Returns None.
    """
    usage_text = '''
    usage: python sentiment.py logs/fileprefix ClassifierName,s methodName,s ngramVal,s negtnVal,s
        example: python sentimenttry.py logs/fileprefix NaiveBayesClassifier,MaxentClassifier,DecisionTreeClassifier,SvmClassifier 1step,2step 1,3 0
        ClassifierName,s:   %s
        methodName,s:       %s
        ngramVal,s:         %s
        negtnVal,s:         %s
    ''' % (str(LIST_CLASSIFIERS), str(LIST_METHODS), str([1, 3]), str([0, 1]))

    argc = len(argv)

    # Every positional argument is optional; absent ones use a default.
    log_prefix = str(argv[0]) if argc >= 1 else 'logs/run'

    classifiers = ([c for c in argv[1].split(',') if c in LIST_CLASSIFIERS]
                   if argc >= 2 else ['NaiveBayesClassifier'])

    methods = ([m for m in argv[2].split(',') if m in LIST_METHODS]
               if argc >= 3 else ['1step'])

    # Entries that are not made of digits only are skipped.
    ngrams = ([int(tok) for tok in argv[3].split(',') if tok.isdigit()]
              if argc >= 4 else [2])

    negations = ([bool(int(tok)) for tok in argv[4].split(',')
                  if tok.isdigit()]
                 if argc >= 5 else [False])

    print(classifiers, methods, ngrams, negations)
    if not (log_prefix and classifiers and methods and ngrams and negations):
        print(usage_text)
        return

    corpus_path = ('./stanfordcorpus/' + stanfordcorpus.FULLDATA +
                   '.100000.norm.csv')
    corpus_tweets = stanfordcorpus.getNormalisedTweets(corpus_path)

    random.shuffle(corpus_tweets)

    sys.stderr.write("starting sentimental analysis")
    sys.stderr.write('\nlen( tweets ) = ' + str(len(corpus_tweets)))
    sys.stderr.write('\n')
    sys.stdout.flush()

    # TIME_STAMP is not assigned in this function, so it is looked up in
    # the enclosing module scope.
    stats_prefix = 'logs/stats_' + TIME_STAMP + '/STAN'
    try1.preprocessingStats(corpus_tweets, fileprefix=stats_prefix)

    # The triple grid() nesting yields items shaped
    # (((classifier, method), ngram), negation); the original author's
    # recorded sample output lists e.g.
    #   (('NaiveBayesClassifier', '1step'), 1) False
    #   (('NaiveBayesClassifier', '1step'), 3) False
    run_stamp = get_time_stamp()
    combos = grid(grid(grid(classifiers, methods), ngrams), negations)
    for inner, negation in combos:
        (classifier, method), ngram = inner
        features = {'ngram': ngram, 'negtn': negation}
        try:
            print("Attempting trainAndClassify with These parameters",
                  classifier, method, ngram, negation)
            trainAndClassify(corpus_tweets,
                             classifier=classifier,
                             method=method,
                             feature_set=features,
                             fileprefix=log_prefix + '_' + run_stamp)
        except Exception as err:
            print(err)
示例#4
0
def main(argv):
    """Train and evaluate every requested classifier configuration.

    ``argv`` holds up to five positional values; each missing one falls
    back to the default shown in brackets:

        argv[0]  file prefix forwarded to trainAndClassify  ['logs/run']
        argv[1]  comma-separated classifier names   ['NaiveBayesClassifier']
        argv[2]  comma-separated method names       ['1step']
        argv[3]  comma-separated n-gram values      [1]
        argv[4]  comma-separated negation flags     [False]

    Names that are not in LIST_CLASSIFIERS / LIST_METHODS and numeric
    entries that are not all digits are silently dropped.  If any option
    ends up empty, the usage text is printed and the function returns
    before any data is loaded.

    Otherwise the tweets read from 'sentiment.csv' and from the Stanford
    corpus '.5000.norm.csv' file are concatenated, shuffled and handed to
    trainAndClassify once per (classifier, method, ngram, negation)
    combination produced by the nested grid() calls.  An exception raised
    for one combination is printed and the remaining combinations still run.

    Returns None.
    """
    __usage__='''
    usage: python sentiment.py logs/fileprefix ClassifierName,s methodName,s ngramVal,s negtnVal,s
        ClassifierName,s:   %s
        methodName,s:       %s
        ngramVal,s:         %s
        negtnVal,s:         %s
    ''' % ( str( LIST_CLASSIFIERS ), str( LIST_METHODS ), str([1,3]), str([0,1]) )
    import sanderstwitter02
    import stanfordcorpus
    import stats

    supplied = len(argv)

    if supplied >= 1:
        fileprefix = str(argv[0])
    else:
        fileprefix = 'logs/run'

    if supplied >= 2:
        classifierNames = [name for name in argv[1].split(',')
                           if name in LIST_CLASSIFIERS]
    else:
        classifierNames = ['NaiveBayesClassifier']

    if supplied >= 3:
        methodNames = [name for name in argv[2].split(',')
                       if name in LIST_METHODS]
    else:
        methodNames = ['1step']

    if supplied >= 4:
        ngramVals = [int(val) for val in argv[3].split(',') if val.isdigit()]
    else:
        ngramVals = [1]

    if supplied >= 5:
        # Any non-zero digit string counts as "negation on".
        negtnVals = [bool(int(val)) for val in argv[4].split(',')
                     if val.isdigit()]
    else:
        negtnVals = [False]

    # An option that was supplied but contained no valid entry is an error.
    if not (fileprefix and classifierNames and methodNames and ngramVals
            and negtnVals):
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(__usage__)
        return

    tweets1 = sanderstwitter02.getTweetsRawData('sentiment.csv')
    tweets2 = stanfordcorpus.getNormalisedTweets(
        'stanfordcorpus/' + stanfordcorpus.FULLDATA + '.5000.norm.csv')
    #random.shuffle(tweets1)
    #random.shuffle(tweets2)
    tweets = tweets1 + tweets2
    random.shuffle(tweets)
    #tweets = tweets[:100]
    sys.stderr.write('\nlen( tweets ) = ' + str(len(tweets)))

    # Optional diagnostics, currently disabled:
    #sys.stderr.write( '\n' )
    #stats.preprocessingStats( tweets1, fileprefix='logs/stats_'+TIME_STAMP+'/TSC' )
    #sys.stderr.write( '\n' )
    #stats.preprocessingStats( tweets2, fileprefix='')#logs/stats_'+TIME_STAMP+'/STAN' )
    #sys.stderr.write( '\n' )
    #stats.stepStats( tweets , fileprefix='logs/stats_'+TIME_STAMP+'/Both' )

    #generateARFF(tweets, fileprefix)

    #print classifierNames, methodNames, ngramVals, negtnVals
    TIME_STAMP = get_time_stamp()
    # Each grid() item is shaped (((classifier, method), ngram), negation).
    for (((cname, mname), ngramVal), negtnVal) in grid(
            grid(grid(classifierNames, methodNames), ngramVals), negtnVals):
        try:
            trainAndClassify(
                tweets, classifier=cname, method=mname,
                feature_set={'ngram': ngramVal, 'negtn': negtnVal},
                fileprefix=fileprefix + '_' + TIME_STAMP)
        except Exception as e:
            # Best effort: report the failure and move on to the next
            # configuration instead of aborting the whole run.
            print(e)