def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :type csv_path: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :type target_dataset: str

    :return: a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file corresponds
    schema_tables = dict((
        t,
        dict((f['label'], f['datastore_id'])
             for f in get_table(t)['fields']))
        for t in dataset_types)

    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)

    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames
        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys()) and
                    len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    assert schema_cols is not None, '{0:s} does not match any dataset type {1}'.format(
        csv_path, dataset_types)

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()   # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            org = row_dict.pop('Org')
            if org_id not in records:
                if len(records.keys()):
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []
            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}

    yield records
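
# Illustrative usage (a minimal sketch only): each yielded batch maps a single
# org-id to up to BATCH_SIZE row dicts keyed by datastore_id. The file name
# 'ati-records.csv' below is an assumed example value, not part of this code.
#
#     for batch in csv_data_batch('ati-records.csv', 'ati'):
#         for org_id, rows in batch.iteritems():
#             print '%s: %d record(s)' % (org_id, len(rows))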
"time_zone", "uid", "user.name", "utc_offset", "verified", "trainingLabel", ] html_parser = HTMLParser.HTMLParser() with open(mturk_labeled_filename, 'rb') as mturk_labeled_file_handle: mturk_labeled_data_reader = DictReader(mturk_labeled_file_handle, fieldnames=header, encoding='utf-8') # skip first mturk_labeled_data_reader.next() # Dictionary to count flags flag_count_on_tweets = {} for hit in mturk_labeled_data_reader: if hit["AssignmentStatus"] != "Approved": continue tweet_id = hit['Input.id'] answer = hit['Answer.Q3Answer'] if tweet_id not in flag_count_on_tweets: flag_count_on_tweets[tweet_id] = 0 if answer != 'N/A': flag_count_on_tweets[tweet_id] += 1 counter = {0: 0, 1: 0, 2: 0, 3: 0} with codecs.open(line_separated_tweets_json_file_name, 'r', 'utf8') as line_separated_tweets_handle: with open(aml_training_dataset_filename,
"text", "time_zone", "uid", "user.name", "utc_offset", "verified", "trainingLabel", ] html_parser = HTMLParser.HTMLParser() with open(mturk_labeled_filename, 'rb') as mturk_labeled_file_handle: mturk_labeled_data_reader = DictReader( mturk_labeled_file_handle, fieldnames=header, encoding='utf-8') # skip first mturk_labeled_data_reader.next() # Dictionary to count flags flag_count_on_tweets = {} for hit in mturk_labeled_data_reader: if hit["AssignmentStatus"] != "Approved": continue tweet_id = hit['Input.id'] answer = hit['Answer.Q3Answer'] if tweet_id not in flag_count_on_tweets: flag_count_on_tweets[tweet_id] = 0 if answer != 'N/A': flag_count_on_tweets[tweet_id] += 1 counter = {0: 0, 1: 0, 2: 0, 3: 0} with codecs.open(line_separated_tweets_json_file_name, 'r', 'utf8') as line_separated_tweets_handle: with open(aml_training_dataset_filename, 'wb') as aml_training_dataset_handle: csv_writer = unicodecsv.writer(