def wpre(self):
    """Total precipitation for the ISO week stored in self.week."""
    x = DataParser('weatherNYC.csv')
    total = 0  # avoid shadowing the built-in sum()
    for i in range(len(x.dates)):
        # isocalendar() returns (year, week number, weekday)
        if x.dates[i].isocalendar()[1] == self.week:
            total += x.precipations[i]
    return total
Example No. 2
def mavg(self):
    """Mean of the daily average temperatures for the month stored in self.month."""
    total = 0
    counter = 0
    x = DataParser('weatherNYC.csv')
    for i in range(len(x.dates)):
        if x.dates[i].month == self.month:
            total += x.avgs[i]
            counter += 1
    return total / counter
Example No. 3
def mmax(self):
    """Highest daily maximum temperature recorded in the month stored in self.month."""
    mmax = float('-inf')  # sentinel lower than any real temperature
    x = DataParser('weatherNYC.csv')
    dates = x.dates
    maxs = x.maxs
    for i in range(len(dates)):
        if dates[i].month == self.month and maxs[i] > mmax:
            mmax = maxs[i]
    return mmax
Example No. 4
def mmin(self):
    """Lowest daily minimum temperature recorded in the month stored in self.month."""
    mmin = float('inf')  # sentinel higher than any real temperature
    x = DataParser('weatherNYC.csv')
    dates = x.dates
    mins = x.mins
    for i in range(len(dates)):
        if dates[i].month == self.month and mins[i] < mmin:
            mmin = mins[i]
    return mmin
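The four weather-statistics methods above (wpre, mavg, mmax, mmin) assume an object that stores the week and month to query in self.week and self.month; that class is not shown in this listing. A minimal sketch of such a wrapper, with the class name WeatherStats assumed:

class WeatherStats:
    # hypothetical holder for the query parameters used by the methods above
    def __init__(self, week=None, month=None):
        self.week = week    # ISO week number, used by wpre()
        self.month = month  # calendar month (1-12), used by mavg()/mmax()/mmin()

# usage, assuming the methods above are attached to WeatherStats:
# stats = WeatherStats(week=23, month=6)
# print(stats.wpre(), stats.mavg(), stats.mmax(), stats.mmin())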
Example No. 5
def main():
    _initWordDic()
    # parse the data using dataParser
    parser = DataParser()
    docs, summary = parser.parseFile()
    p_doc = Preparer(docs)
    p_summary = Preparer(summary, is_summary=True)
    p_doc.cutDocs()
    p_summary.cutDocs()
    docLens = p_doc.countDocs()
    sumLens = p_summary.countDocs()
    print(max(sumLens))
    #sys.exit()
    p_doc.doc2Int()
    p_summary.doc2Int()
    # docs, docLens, summary, sumLens are the data
    data = list(zip(docs, summary, docLens, sumLens))
    # split into non-overlapping train / validation / test sets
    training_data = data[:1585]
    validation_data = data[1585:1835]
    testing_data = data[1835:]
    ''' FIXING THE DIMENSION ISSUES OF BATCHES
    sf_train = SF(training_data, CONFIG.BATCH_SIZE, is_training = True)
    sf_valid = SF(validation_data, CONFIG.BATCH_SIZE, is_training = False)
    for tup in sf_train.get_batch(): 
        _, doc, summary, docLens, sumLens = tup
        doc_batch = _get_doc_batch(doc)
        summary_batch = _get_summary_batch(summary)
        label_batch = _get_label_batch(summary)
        docLens = np.array(docLens)
        summaryLens = np.array(sumLens)  
        print (doc_batch[0])
        print (summary_batch[0])
        print (label_batch[0])
        print (list(doc for doc in docLens))
        print (list(doc for doc in summaryLens))
        sys.exit()'''

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-1, 1)
        with tf.name_scope('Train'):
            with tf.variable_scope('Model',
                                   reuse=None,
                                   initializer=initializer):
                m = SummaryModel(is_training=True)
        with tf.name_scope('Valid'):
            with tf.variable_scope('Model',
                                   reuse=True,
                                   initializer=initializer):
                m_valid = SummaryModel(is_training=False)
        with tf.name_scope('Test'):
            with tf.variable_scope('Model',
                                   reuse=True,
                                   initializer=initializer):
                m_test = SummaryModel(is_training=False)

        init_op = tf.global_variables_initializer()
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = '7'
        sess = tf.Session(config=config)
        sess.run(init_op)
        for epoch in range(CONFIG.EPOCH):
            print('--------------- running epoch ' + str(epoch) +
                  ' ----------------')
            run_epoch(sess, m, m_valid, training_data, validation_data)
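The Valid and Test scopes above create no new weights: because they re-enter tf.variable_scope('Model', ...) with reuse=True, every tf.get_variable call inside SummaryModel resolves to the variables already built under Train. A minimal sketch of that sharing mechanism, assuming TensorFlow 1.x (the variable name 'w' is illustrative only):

import tensorflow as tf

with tf.Graph().as_default():
    with tf.variable_scope('Model', reuse=None):
        w_train = tf.get_variable('w', shape=[4], initializer=tf.zeros_initializer())
    with tf.variable_scope('Model', reuse=True):
        w_eval = tf.get_variable('w')  # reuses the existing 'Model/w'; no new variable is created
    print(w_train is w_eval)  # True: both handles refer to the same variable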
Example No. 6
def main():
    db = initializeMongoDB()
    tweetsTrainingCollection = db["tweetsTrainData"]
    fetchDataFrom = "Kaggle"
    data = []

    tracemalloc.start()
    getMemoryUsage()


    if fetchDataFrom == "db":
        tokensForModel = DataParser.parseTweets(True, NewDataRepository.getData(), "")
        NewDataRepository.saveTrainingAndTestData(tokensForModel, db, "False")
        positiveTrainingTweets = NewDataRepository.getPositiveDBTweets(tweetsTrainingCollection)
        negativeTrainingTweets = NewDataRepository.getNegativeDBTweets(tweetsTrainingCollection)
        data = DataParser.prepareDataForTraining(True, positiveTrainingTweets, negativeTrainingTweets)
   
    if (fetchDataFrom == "twitter_samples" ):
        #positive_tweets = twitter_samples.strings('positive_tweets.json')
        #negative_tweets = twitter_samples.strings('negative_tweets.json')
        positiveTrainingTweets = DataParser.parseTweets(False,SampleRepository.getPositiveTweets(),"positive")
        negativeTrainingTweets =  DataParser.parseTweets(False,SampleRepository.getNegativeTweets(),"negative")

        data = DataParser.prepareDataForTraining(False,positiveTrainingTweets,negativeTrainingTweets)

    if (fetchDataFrom == "Kaggle"):
        tweets = KaggleRepository.getExtensiveCSVTweetsForTraining()
        negativeTweets = []
        poistiveTweets = []
        for tweet in tweets:
            if (tweet[0] == 0):
                negativeTweets.append(tweet[1])
            if (tweet[0] == 4):
                poistiveTweets.append(tweet[1])
        del tweets
        negativeTrainingTweets = DataParser.parseTweets(True,negativeTweets[:10000],"negative")
        positiveTrainingTweets = DataParser.parseTweets(True,poistiveTweets[:10000],"positive")
        getMemoryUsage()
        del negativeTweets
        del poistiveTweets
        #gc.collect()
        getMemoryUsage()
        data = DataParser.prepareDataForTraining(False,positiveTrainingTweets,negativeTrainingTweets)
        del positiveTrainingTweets
        del negativeTrainingTweets
        getMemoryUsage()
    percentageOfTrainingData = 0.7
    splitIndex = int(len(data) * percentageOfTrainingData)
    trainingData = data[:splitIndex]
    testData = data[splitIndex:]
    del data
    getMemoryUsage()
    #KaggleRepository.saveTweetsInFile(trainingData,"Positive")
    #KaggleRepository.saveTweetsInFile(trainingData,"Negative")

    bayesClassifier = BayesClassifier(0)
    bayesClassifier.train(trainingData)
    bayesClassifier.setAccuracy(testData)
    print("accuracy : " , bayesClassifier.getAccuracy())
    #del trainingData
    #del testData
    #getMemoryUsage()
    #gc.collect
    #customTweet = 'Thank you for sending my baggage to CityX and flying me to CityY at the same time... Brilliant service. #thanksGenericAirline'
    customTweet = 'With this said, I think we are going to the moon'
    customTokens = DataParser.removeNoise(customTweet, stopwords.words('english'))
    print(bayesClassifier.avalueTweet(customTokens))
    customTweet = 'With this said, I think we are going all the way down'
    customTokens = DataParser.removeNoise(customTweet, stopwords.words('english'))
    print(bayesClassifier.avalueTweet(customTokens))
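The flow above (parse tweets into tokens, train a Bayes classifier, score held-out data, then classify custom tweets) follows the standard NLTK Naive Bayes pattern: training takes a list of (feature-dict, label) pairs and classification takes a feature dict. A minimal self-contained sketch of that pattern, using nltk.NaiveBayesClassifier as a stand-in for the repository's BayesClassifier and DataParser (which are not shown in this listing):

from nltk import NaiveBayesClassifier

# each training item is ({token: True, ...}, label)
train = [
    ({'great': True, 'flight': True}, 'positive'),
    ({'lost': True, 'baggage': True}, 'negative'),
]
classifier = NaiveBayesClassifier.train(train)
print(classifier.classify({'great': True, 'service': True}))  # 'positive' for this toy data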
Example No. 7
def buildIndex(
    iterations,
    reset=True,
    resetFiles=True,
    passwordLock=True,
    dev=False,
    options={
        'crawl': True,
        'pageRank': True,
        'parse': True,
        'database': True,
        'idf': True,
        'tfidf': True
    }):
    log('build index', 'Running full suite of crawler programs.')
    programStartTime = time.time()

    loginSuccess = False
    if reset and passwordLock:
        log("info", "You are about to reset the database")
        pwd = getpass('Enter password to continue:').encode('UTF-8')
        if (bcrypt.checkpw(pwd, loginPwd)):
            loginSuccess = True
            log('login', 'Login successful. Resetting databases.')
        else:
            log('login', 'Login failed. Reset operation not performed')

    else:
        loginSuccess = True

    if resetFiles and exists('domains'):
        log('cleanup', 'Removing old domains folder')
        rmtree('./domains')

    # reset both stores, but only when a reset was requested and login succeeded (short-circuit chain)
    reset and loginSuccess and DatabaseBuilder.resetInvertedIndex() and DatabaseBuilder.resetCrawler()

    for domain in domains:
        domainStartTime = time.time()

        if options['crawl']:
            crawler = Crawler(domain['name'], domain['root'])
            crawler.runSpider(iterations)

        inlinkGraphFile = 'domains/{0}/{0}_inlinks.json'.format(domain['name'])
        outlinkGraphFile = 'domains/{0}/{0}_outlinks.json'.format(domain['name'])
        if options['pageRank']:
            calculatePageRank(domain['name'], inlinkGraphFile, outlinkGraphFile, 3)

        if options['parse']:
            dataParser = DataParser(domain['name'])
            dataParser.runParser()

        if options['database']:
            databaseBuilder = DatabaseBuilder(domain['name'],
                                              mode='DEV' if dev else 'PROD')
            databaseBuilder.build()

        log(
            "time", domain['name'] + " finished running in " +
            str(time.time() - domainStartTime) + " seconds.")

    if options['idf']:
        DatabaseBuilder.calculateIDF()
    if options['tfidf']:
        calculateTFIDF()
    log(
        "time", "Program finished running in " +
        str(time.time() - programStartTime) + " seconds.")
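A hedged usage sketch for buildIndex, based only on the signature above (the iteration count and flag values are illustrative):

# Crawl each configured domain for 2 iterations against the development database,
# keeping existing data and skipping the password prompt.
buildIndex(2, reset=False, resetFiles=False, passwordLock=False, dev=True)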
Example No. 8
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.animation as animation


from dataParser import DataParser
from matplotlib import style

style.use('dark_background')


x = DataParser('weatherNYC.csv')
plt.plot(x.dates, x.avgs)
plt.title('NYC temperature in 2016 (in ℉)')
plt.show()
Example No. 9
def test_mins(self):
    x = DataParser('test.csv')
    self.assertEqual(x.mins, [71, 75])
Example No. 10
def test_maxs(self):
    x = DataParser('test.csv')
    self.assertEqual(x.maxs, [89, 91])
Example No. 11
def test_precipations(self):
    x = DataParser('test.csv')
    self.assertEqual(x.precipations, [0, 0.22])
Example No. 12
def test_avgs(self):
    x = DataParser('test.csv')
    self.assertEqual(x.avgs, [80, 83])
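The test methods above rely on self.assertEqual, so they presumably live in a unittest.TestCase subclass backed by a small two-row test.csv fixture; neither the class nor the fixture appears in this listing. A minimal sketch of the surrounding harness, with the class name TestDataParser assumed:

import unittest
from dataParser import DataParser

class TestDataParser(unittest.TestCase):
    # test_mins, test_maxs, test_precipations and test_avgs from the examples above go here
    def test_avgs(self):
        x = DataParser('test.csv')
        self.assertEqual(x.avgs, [80, 83])

if __name__ == '__main__':
    unittest.main()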