예제 #1
0
            # Open the output file that will receive processed tweet strings.
            # NOTE(review): not closed in this visible fragment.
            f_out = open(processed_tweets_file,'w')

            tweets_list = []  # NOTE(review): unused in the visible fragment
            tweet_total = 0   # successfully processed lines
            lost_tweets = 0   # lines that raised during processing
            line_number = 0   # 1-based counter used in error messages

            with open(rawTweetsFile) as f:
                for line in f:

                    try:
                        line_number += 1
                        line = line.strip()

                        # Convert one raw tweet line to its processed output
                        # form; track_list supplies the tracked terms.
                        tweet_out_string = tweetprocessing.process_tweet(line, track_list, expand_url=EXPAND_URLS)
                        f_out.write(tweet_out_string)
                        tweet_total += 1
                        # print tweet_out_string

                    # Python 2 except syntax. Bad lines are counted, logged,
                    # and skipped so one malformed tweet does not abort the
                    # whole file.
                    except ValueError, e:
                        lost_tweets += 1
                        print "ValueError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                        logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                        logging.exception(e)
                        print traceback.format_exc()
                        pass  # NOTE(review): redundant after the statements above
                    except TypeError, e:
                        lost_tweets += 1
                        print "TypeError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                        logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
예제 #2
0
def go(project_id, rawdir, archdir, insertdir, logdir):
    """Preprocess raw Twitter files for one project until told to stop.

    Marks the project's 'twitter' module as active in its config DB, sets
    up a daily-rotating log file under *logdir*, then, while the stored
    ``processor.run`` flag is truthy, repeatedly takes the oldest raw tweet
    file from the queue and writes a processed copy of it.  Tweet lines
    that fail to parse are counted, logged, and appended to an error file.
    Files whose names contain ``-streamlimits-`` go through
    ``tweetprocessing.process_limit`` instead; ``-delete-`` files are
    skipped by both branches.

    NOTE(review): ``insertdir`` is unused and ``runPreProcessor`` is never
    re-read in the portion of the loop visible here -- this view appears
    truncated, so both are presumably handled further down; confirm against
    the full source.
    """
    # Connects to project account DB
    project = db.get_project_detail(project_id)
    project_name = project['project_name']

    configdb = project['project_config_db']
    conn = db.connection[configdb]
    project_config_db = conn.config

    # Reference for controller if script is active or not.
    project_config_db.update({'module': 'twitter'},
                             {'$set': {
                                 'processor_active': 1
                             }})

    Config = ConfigParser.ConfigParser()
    Config.read(PLATFORM_CONFIG_FILE)

    # Creates logger w/ level INFO
    logger = logging.getLogger('preprocess')
    logger.setLevel(logging.INFO)
    # Creates rotating file handler w/ level INFO
    # ('D', 1, 30 = rotate daily, keep 30 backups)
    fh = logging.handlers.TimedRotatingFileHandler(
        logdir + '/' + project_name + '-processor-log-' + project_id + '.out',
        'D', 1, 30, None, False, False)
    fh.setLevel(logging.INFO)
    # Creates formatter and applies to rotating handler
    # NOTE(review): 'format' shadows the builtin of the same name
    # (harmless locally, but worth renaming).
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    datefmt = '%m-%d %H:%M'
    formatter = logging.Formatter(format, datefmt)
    fh.setFormatter(formatter)
    # Finishes by adding the rotating, formatted handler
    logger.addHandler(fh)

    # NOTE(review): getLogger('preprocess') returns the same logger object
    # fetched above, so this re-fetch is a no-op.
    logger = logging.getLogger('preprocess')
    logger.info('Starting preprocess system')

    if not os.path.exists(rawdir + '/error_tweets/'):
        os.makedirs(rawdir + '/error_tweets/')

    # Append-mode sink for raw lines that failed processing.
    # NOTE(review): never closed in the visible portion of this function.
    error_tweet = open(
        rawdir + '/error_tweets/error_tweet-' + project_name + '-' +
        project_id + '.txt', 'a')

    module_config = project_config_db.find_one({'module': 'twitter'})
    runPreProcessor = module_config['processor']['run']

    if runPreProcessor:
        print 'Starting runPreProcessor'
        logger.info('Preprocess start signal')
    runLoopSleep = 0  # NOTE(review): unused in the visible portion

    while runPreProcessor:

        # Get all terms for all collectors
        track_list = []
        for collector in project['collectors']:
            if collector['terms_list']:
                tmp_terms = [term['term'] for term in collector['terms_list']]
                track_list += tmp_terms

        # De-duplicate terms collected across collectors.
        if track_list:
            track_list = list(set(track_list))

        tweetsFileList = get_tweet_file_queue(Config, rawdir)
        files_in_queue = len(tweetsFileList)

        if files_in_queue < 1:
            # Nothing queued; poll again in three minutes.
            time.sleep(180)
        else:
            logger.info('Queue length is %d' % files_in_queue)
            # Process the oldest file in the queue first.
            rawTweetsFile = tweetsFileList[0]
            logger.info('Preprocess raw file: %s' % rawTweetsFile)

            processed_tweets_file = get_processed_tweets_file_name(
                Config, rawTweetsFile, rawdir, archdir)

            # TODO - Dynamic copy time
            # lame workaround, but for now we assume it will take less than a minute to
            # copy a file so this next sleep is here to wait for a copy to finish on the
            # off chance that we happy to see it just as it is being copied to the directory
            time.sleep(60)

            # NOTE(review): opened each iteration but not closed in the
            # visible portion -- leaks a handle per file unless closed
            # further down.
            f_out = open(processed_tweets_file, 'w')

            tweets_list = []  # NOTE(review): unused in the visible portion
            tweet_total = 0   # successfully processed lines
            lost_tweets = 0   # lines that raised during processing
            line_number = 0   # 1-based counter used in error messages

            with open(rawTweetsFile) as f:
                if '-delete-' not in rawTweetsFile and '-streamlimits-' not in rawTweetsFile:
                    # Ordinary tweet file: process line by line; a bad line
                    # is counted, logged, and skipped (Python 2 except
                    # syntax) rather than aborting the whole file.
                    for line in f:
                        try:
                            line_number += 1
                            line = line.strip()

                            tweet_out_string = tweetprocessing.process_tweet(
                                line, track_list, expand_url=EXPAND_URLS)
                            f_out.write(tweet_out_string)
                            tweet_total += 1
                            # print tweet_out_string

                        except ValueError, e:
                            lost_tweets += 1
                            print "ValueError. tweet not processed: %d (%s)" % (
                                line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" %
                                           (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line + "\n")
                            print traceback.format_exc()
                            pass
                        except TypeError, e:
                            lost_tweets += 1
                            print "TypeError. tweet not processed: %d (%s)" % (
                                line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" %
                                           (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line + "\n")
                            print traceback.format_exc()
                            pass
                        except KeyError, e:
                            lost_tweets += 1
                            print "KeyError. tweet not processed: %d (%s)" % (
                                line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" %
                                           (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line + "\n")
                            print traceback.format_exc()
                            pass
                elif '-streamlimits-' in rawTweetsFile:
                    # Stream-limit notices get their own processing path.
                    server_name = os.uname()[1]
                    try:
                        # Collector id is taken from the 7th '-'-separated
                        # field of the file name.
                        collector_id = rawTweetsFile.split('-')[6]
                        collector = db.get_collector_detail(
                            project_id=project_id, collector_id=collector_id)
                        col_type = collector['collector']['api']
                    # NOTE(review): bare except hides real errors; also, if
                    # the split/index above raised, collector_id is unbound
                    # when used in the loop below (NameError).
                    except:
                        col_type = 'UNDEFINED'
                    for line in f:
                        line = line.strip()
                        limit_out_string = tweetprocessing.process_limit(
                            line, col_type, server_name, project_name,
                            project_id, collector_id)
                        f_out.write(limit_out_string)
예제 #3
0
def go(project_id, rawdir, archdir, insertdir, logdir):
    """Preprocess raw Twitter files for one project until told to stop.

    Marks the project's 'twitter' module as active in its config DB, sets
    up a daily-rotating log file under *logdir*, then, while the stored
    ``processor.run`` flag is truthy, repeatedly takes the oldest raw tweet
    file from the queue and writes a processed copy of it.  Tweet lines
    that fail to parse are counted, logged, and appended to an error file.
    Files whose names contain ``-streamlimits-`` go through
    ``tweetprocessing.process_limit`` instead; ``-delete-`` files are
    skipped by both branches.

    NOTE(review): ``insertdir`` is unused and ``runPreProcessor`` is never
    re-read in the portion of the loop visible here -- this view appears
    truncated, so both are presumably handled further down; confirm against
    the full source.
    """
    # Connects to project account DB
    project = db.get_project_detail(project_id)
    project_name = project['project_name']

    configdb = project['project_config_db']
    conn = db.connection[configdb]
    project_config_db = conn.config

    # Reference for controller if script is active or not.
    project_config_db.update({'module': 'twitter'}, {'$set': {'processor_active': 1}})

    Config = ConfigParser.ConfigParser()
    Config.read(PLATFORM_CONFIG_FILE)

    # Creates logger w/ level INFO
    logger = logging.getLogger('preprocess')
    logger.setLevel(logging.INFO)
    # Creates rotating file handler w/ level INFO
    # ('D', 1, 30 = rotate daily, keep 30 backups)
    fh = logging.handlers.TimedRotatingFileHandler(logdir + '/' + project_name + '-processor-log-' + project_id + '.out', 'D', 1, 30, None, False, False)
    fh.setLevel(logging.INFO)
    # Creates formatter and applies to rotating handler
    # NOTE(review): 'format' shadows the builtin of the same name
    # (harmless locally, but worth renaming).
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    datefmt = '%m-%d %H:%M'
    formatter = logging.Formatter(format, datefmt)
    fh.setFormatter(formatter)
    # Finishes by adding the rotating, formatted handler
    logger.addHandler(fh)

    # NOTE(review): getLogger('preprocess') returns the same logger object
    # fetched above, so this re-fetch is a no-op.
    logger = logging.getLogger('preprocess')
    logger.info('Starting preprocess system')

    if not os.path.exists(rawdir + '/error_tweets/'):
        os.makedirs(rawdir + '/error_tweets/')

    # Append-mode sink for raw lines that failed processing.
    # NOTE(review): never closed in the visible portion of this function.
    error_tweet = open(rawdir + '/error_tweets/error_tweet-' + project_name + '-' + project_id + '.txt', 'a')

    module_config = project_config_db.find_one({'module': 'twitter'})
    runPreProcessor = module_config['processor']['run']

    if runPreProcessor:
        print 'Starting runPreProcessor'
        logger.info('Preprocess start signal')
    runLoopSleep = 0  # NOTE(review): unused in the visible portion

    while runPreProcessor:

        # Get all terms for all collectors
        track_list = []
        for collector in project['collectors']:
            if collector['terms_list']:
                tmp_terms = [term['term'] for term in collector['terms_list']]
                track_list += tmp_terms

        # De-duplicate terms collected across collectors.
        if track_list:
            track_list = list(set(track_list))

        tweetsFileList = get_tweet_file_queue(Config, rawdir)
        files_in_queue = len(tweetsFileList)

        if files_in_queue < 1:
            # Nothing queued; poll again in three minutes.
            time.sleep( 180 )
        else:
            logger.info('Queue length is %d' % files_in_queue)
            # Process the oldest file in the queue first.
            rawTweetsFile = tweetsFileList[0]
            logger.info('Preprocess raw file: %s' % rawTweetsFile)

            processed_tweets_file = get_processed_tweets_file_name(Config, rawTweetsFile, rawdir, archdir)

            # TODO - Dynamic copy time
            # lame workaround, but for now we assume it will take less than a minute to
            # copy a file so this next sleep is here to wait for a copy to finish on the
            # off chance that we happy to see it just as it is being copied to the directory
            time.sleep( 60 )

            # NOTE(review): opened each iteration but not closed in the
            # visible portion -- leaks a handle per file unless closed
            # further down.
            f_out = open(processed_tweets_file,'w')

            tweets_list = []  # NOTE(review): unused in the visible portion
            tweet_total = 0   # successfully processed lines
            lost_tweets = 0   # lines that raised during processing
            line_number = 0   # 1-based counter used in error messages

            with open(rawTweetsFile) as f:
                if '-delete-' not in rawTweetsFile and '-streamlimits-' not in rawTweetsFile:
                    # Ordinary tweet file: process line by line; a bad line
                    # is counted, logged, and skipped (Python 2 except
                    # syntax) rather than aborting the whole file.
                    for line in f:
                        try:
                            line_number += 1
                            line = line.strip()

                            tweet_out_string = tweetprocessing.process_tweet(line, track_list, expand_url=EXPAND_URLS)
                            f_out.write(tweet_out_string)
                            tweet_total += 1
                            # print tweet_out_string

                        except ValueError, e:
                            lost_tweets += 1
                            print "ValueError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line+"\n")
                            print traceback.format_exc()
                            pass
                        except TypeError, e:
                            lost_tweets += 1
                            print "TypeError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line+"\n")
                            print traceback.format_exc()
                            pass
                        except KeyError, e:
                            lost_tweets += 1
                            print "KeyError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line+"\n")
                            print traceback.format_exc()
                            pass
                elif '-streamlimits-' in rawTweetsFile:
                    # Stream-limit notices get their own processing path.
                    server_name = os.uname()[1]
                    try:
                        # Collector id is taken from the 7th '-'-separated
                        # field of the file name.
                        collector_id = rawTweetsFile.split('-')[6]
                        collector = db.get_collector_detail(project_id=project_id, collector_id=collector_id)
                        col_type = collector['collector']['api']
                    # NOTE(review): bare except hides real errors; also, if
                    # the split/index above raised, collector_id is unbound
                    # when used in the loop below (NameError).
                    except:
                        col_type = 'UNDEFINED'
                    for line in f:
                        line = line.strip()
                        limit_out_string = tweetprocessing.process_limit(line, col_type, server_name, project_name, project_id, collector_id)
                        f_out.write(limit_out_string)
예제 #4
0
            # Open the output file that will receive processed tweet strings.
            # NOTE(review): not closed in this visible fragment.
            f_out = open(processed_tweets_file,'w')

            tweets_list = []  # NOTE(review): unused in the visible fragment
            tweet_total = 0   # successfully processed lines
            lost_tweets = 0   # lines that raised during processing
            line_number = 0   # 1-based counter used in error messages

            with open(rawTweetsFile) as f:
                for line in f:

                    try:
                        line_number += 1
                        line = line.strip()

                        # Convert one raw tweet line to its processed output
                        # form; track_list supplies the tracked terms.
                        tweet_out_string = tweetprocessing.process_tweet(line, track_list, expand_url=EXPAND_URLS)
                        f_out.write(tweet_out_string)
                        tweet_total += 1
                        # print tweet_out_string

                    # Python 2 except syntax. Bad lines are counted, logged,
                    # appended to the error-tweet file, and skipped so one
                    # malformed tweet does not abort the whole file.
                    except ValueError, e:
                        lost_tweets += 1
                        print "ValueError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                        logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                        logging.exception(e)
                        error_tweet.write(line+"\n")
                        print traceback.format_exc()
                        pass  # NOTE(review): redundant after the statements above
                    except TypeError, e:
                        lost_tweets += 1
                        print "TypeError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)