예제 #1
0
def sample_tweets(authors_list, num_tweets, features):
    """Pick a random subset of tweets for every author file.

    Args:
        authors_list: iterable of paths to per-author data files.
        num_tweets: upper bound on how many tweets to keep per author; if an
            author has fewer, all of them are kept.
        features: accepted for interface compatibility; not used here.

    Returns:
        A dict keyed by each file's base name without its extension, whose
        values are the first ``num_tweets`` items of that author's shuffled
        tweets.
    """
    sampled_by_author = {}
    for author_path in authors_list:
        file_name = os.path.basename(author_path)
        logging.debug('\tSampling tweets from author ' + file_name + ' ...')
        # NOTE(review): messages_persistence.read is assumed to return a
        # mutable list, since it is shuffled in place below -- confirm.
        author_tweets = messages_persistence.read(author_path)
        random.shuffle(author_tweets)
        author_key, _extension = os.path.splitext(file_name)
        sampled_by_author[author_key] = author_tweets[0:num_tweets]
    return sampled_by_author
예제 #2
0
        ]))
        sys.exit(1)
    os.makedirs(args.dest_dir)

    logging.info('Filtering tweets by language ...')
    author_filenames = glob.glob(''.join(
        [args.source_dir_data, os.sep, '*.dat']))
    num_files = len(author_filenames)  # processing feedback
    i = 0  # processing feedback
    for author_filename in author_filenames:
        sys.stdout.write(''.join(
            ['\t', str(i), '/',
             str(num_files), ' files processed\r']))
        i += 1
        logging.debug(''.join(['Processing ', author_filename, ' file ...']))
        messages = messages_persistence.read(author_filename)
        messages_filtered = []
        for message in messages:
            logging.debug(''.join(
                ['Detecting language for tweet: ', message['tweet']]))
            try:  # code guess-language breaks for some tweets
                detected_language = guess_language.guessLanguageName(
                    message['tweet'])
            except Exception as e:
                logging.warning(
                    'guess-language library error in detecting language for tweet: '
                    + message['tweet'])
                logging.warning('Exception message: ' + str(e))
                logging.warning('Exception stack trace:')
                traceback.print_tb(sys.exc_info()[2])
                detected_language = None
예제 #3
0
    # logging configuration
    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, format='[%(asctime)s] - %(levelname)s - %(message)s')

    logging.info(''.join(['Starting generating n-grams ...',
                           '\n\tsource directory data = ', args.source_dir_data,
                           '\n\toutput directory = ', args.dest_dir,
                           '\n\tfeatures = ', str(args.features),
                           '\n\tdebug = ', str(args.debug),
                         ]))

    logging.info('Creating output directory ...')
    if os.path.exists(args.dest_dir):
        logging.error('Output directory already exists. Quitting ...')
        sys.exit(1)
    os.makedirs(args.dest_dir)

    author_dirnames = glob.glob(os.sep.join([args.source_dir_data, '*.dat']))
    num_files = len(author_dirnames)    # processing feedback
    i = 0                               # processing feedback
    logging.info('Reading dataset and generating n-grams ...')
    for filename in author_dirnames:
        sys.stderr.write(''.join(['\t', str(i), '/', str(num_files), ' files processed\r']))   # processing feedback
        i += 1
        logging.debug(''.join(['Reading tweets and generating n-grams for file ', filename, ' ...']))
        author_dir = os.sep.join([args.dest_dir, os.path.splitext(os.path.basename(filename))[0]])
        os.makedirs(author_dir)
        ngrams_generator(messages_persistence.read(filename), args.features, author_dir)

    logging.info('Finishing ...')
    
        logging.error('Destination directory already exists. Quitting ...')
        sys.exit(1)
    os.makedirs(args.dest_dir)

    logging.info('Tagging tweets ...')
    filenames = glob.glob(''.join([args.source_dir_data, os.sep, '*.dat']))
    i = 0  # processing feedback
    for filename in filenames:
        sys.stderr.write(''.join(
            ['\t',
             str(i), '/',
             str(len(filenames)),
             ' files processed\r']))  # processing feedback
        i += 1  # processing feedback
        logging.debug(''.join(['Processing file ', filename, ' ...']))
        messages = messages_persistence.read(filename)
        messages_tagged = []
        for message in messages:
            tagged = message['tweet']
            if not args.no_url:
                tagged = tag_url(tagged)
            if not args.no_userref:
                tagged = tag_userref(tagged)
            if not args.no_hashtag:
                tagged = tag_hashtag(tagged)
            if not args.no_date:
                tagged = tag_date(tagged)
            if not args.no_time:
                tagged = tag_time(tagged)
            if not args.no_number:
                tagged = tag_number(tagged)