def sample_tweets(authors_list, num_tweets, features):
    """Randomly sample at most *num_tweets* tweets from each author file.

    Parameters
    ----------
    authors_list : list of str
        Paths to per-author ``.dat`` files readable by
        ``messages_persistence.read``.
    num_tweets : int
        Maximum number of tweets kept per author; fewer are kept when an
        author has fewer tweets than that.
    features : unused
        Accepted for interface compatibility with callers; the sampling
        itself does not consult it.

    Returns
    -------
    dict
        Maps author id (the file's basename without extension) to the
        sampled list of tweets.
    """
    sampled_by_author = {}
    for path in authors_list:
        base = os.path.basename(path)
        logging.debug(''.join([
            '\tSampling tweets from author ', base, ' ...'
        ]))
        # Shuffle in place, then take a prefix: a uniform random sample
        # without replacement.
        author_tweets = messages_persistence.read(path)
        random.shuffle(author_tweets)
        author_id = os.path.splitext(base)[0]
        sampled_by_author[author_id] = author_tweets[:num_tweets]
    return sampled_by_author
])) sys.exit(1) os.makedirs(args.dest_dir) logging.info('Filtering tweets by language ...') author_filenames = glob.glob(''.join( [args.source_dir_data, os.sep, '*.dat'])) num_files = len(author_filenames) # processing feedback i = 0 # processing feedback for author_filename in author_filenames: sys.stdout.write(''.join( ['\t', str(i), '/', str(num_files), ' files processed\r'])) i += 1 logging.debug(''.join(['Processing ', author_filename, ' file ...'])) messages = messages_persistence.read(author_filename) messages_filtered = [] for message in messages: logging.debug(''.join( ['Detecting language for tweet: ', message['tweet']])) try: # code guess-language breaks for some tweets detected_language = guess_language.guessLanguageName( message['tweet']) except Exception as e: logging.warning( 'guess-language library error in detecting language for tweet: ' + message['tweet']) logging.warning('Exception message: ' + str(e)) logging.warning('Exception stack trace:') traceback.print_tb(sys.exc_info()[2]) detected_language = None
# logging configuration
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO,
                    format='[%(asctime)s] - %(levelname)s - %(message)s')

# Banner with the effective run configuration.
logging.info('Starting generating n-grams ...'
             + '\n\tsource directory data = ' + args.source_dir_data
             + '\n\toutput directory = ' + args.dest_dir
             + '\n\tfeatures = ' + str(args.features)
             + '\n\tdebug = ' + str(args.debug))

# Refuse to clobber an existing output directory.
logging.info('Creating output directory ...')
if os.path.exists(args.dest_dir):
    logging.error('Output directory already exists. Quitting ...')
    sys.exit(1)
os.makedirs(args.dest_dir)

author_dirnames = glob.glob(args.source_dir_data + os.sep + '*.dat')
num_files = len(author_dirnames)  # processing feedback

logging.info('Reading dataset and generating n-grams ...')
for i, filename in enumerate(author_dirnames):
    # Progress feedback on stderr; the carriage return keeps it on one line.
    sys.stderr.write('\t' + str(i) + '/' + str(num_files)
                     + ' files processed\r')
    logging.debug('Reading tweets and generating n-grams for file '
                  + filename + ' ...')
    # One output sub-directory per author, named after the file's stem.
    author_dir = (args.dest_dir + os.sep
                  + os.path.splitext(os.path.basename(filename))[0])
    os.makedirs(author_dir)
    ngrams_generator(messages_persistence.read(filename), args.features,
                     author_dir)
logging.info('Finishing ...')
logging.error('Destination directory already exists. Quitting ...') sys.exit(1) os.makedirs(args.dest_dir) logging.info('Tagging tweets ...') filenames = glob.glob(''.join([args.source_dir_data, os.sep, '*.dat'])) i = 0 # processing feedback for filename in filenames: sys.stderr.write(''.join( ['\t', str(i), '/', str(len(filenames)), ' files processed\r'])) # processing feedback i += 1 # processing feedback logging.debug(''.join(['Processing file ', filename, ' ...'])) messages = messages_persistence.read(filename) messages_tagged = [] for message in messages: tagged = message['tweet'] if not args.no_url: tagged = tag_url(tagged) if not args.no_userref: tagged = tag_userref(tagged) if not args.no_hashtag: tagged = tag_hashtag(tagged) if not args.no_date: tagged = tag_date(tagged) if not args.no_time: tagged = tag_time(tagged) if not args.no_number: tagged = tag_number(tagged)