Example #1
0
import os

# DictReader is assumed to come from the unicodecsv package (the code relies on
# its unicode_fieldnames attribute); get_dataset_types, get_table, safe_for_solr
# and BATCH_SIZE are defined elsewhere in the same module.
from unicodecsv import DictReader


def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :type csv_path: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :type target_dataset: str

    :return: a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to
            at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file corresponds
    schema_tables = dict((
            t,
            dict((f['label'], f['datastore_id'])
                for f in get_table(t)['fields']))
        for t in dataset_types)
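    # schema_tables maps each dataset type to {column label: datastore_id}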
    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)
    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames

        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys()) and
                    len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    assert schema_cols, '{0:s} does not match any dataset type {1}'.format(
        csv_path, dataset_types)

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()   # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            org = row_dict.pop('Org')
            if org_id not in records:
                if len(records.keys()):
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []

            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}
    yield records
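
A minimal usage sketch for the generator above. The 'ati' dataset key and the
load_records helper are illustrative placeholders rather than part of the
original module; the loop simply follows the docstring, where each yielded dict
maps at most one org-id to at most BATCH_SIZE records.

def load_dataset_csv(csv_path, target_dataset='ati'):
    total = 0
    for batch in csv_data_batch(csv_path, target_dataset):
        # each batch holds records for at most one organization
        for org_id, rows in batch.items():
            load_records(org_id, rows)  # hypothetical loader, not in the original code
            total += len(rows)
    return total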
Example #2
0
    "time_zone",
    "uid",
    "user.name",
    "utc_offset",
    "verified",
    "trainingLabel",
]

html_parser = HTMLParser.HTMLParser()

with open(mturk_labeled_filename, 'rb') as mturk_labeled_file_handle:
    mturk_labeled_data_reader = DictReader(mturk_labeled_file_handle,
                                           fieldnames=header,
                                           encoding='utf-8')
    # skip first
    mturk_labeled_data_reader.next()
    # Dictionary to count flags
    flag_count_on_tweets = {}
    for hit in mturk_labeled_data_reader:
        if hit["AssignmentStatus"] != "Approved":
            continue
        tweet_id = hit['Input.id']
        answer = hit['Answer.Q3Answer']
        if tweet_id not in flag_count_on_tweets:
            flag_count_on_tweets[tweet_id] = 0
        if answer != 'N/A':
            flag_count_on_tweets[tweet_id] += 1
    counter = {0: 0, 1: 0, 2: 0, 3: 0}
    with codecs.open(line_separated_tweets_json_file_name, 'r',
                     'utf8') as line_separated_tweets_handle:
        with open(aml_training_dataset_filename,
    "text",
    "time_zone",
    "uid",
    "user.name",
    "utc_offset",
    "verified",
    "trainingLabel",
]

html_parser = HTMLParser.HTMLParser()

with open(mturk_labeled_filename, 'rb') as mturk_labeled_file_handle:
    mturk_labeled_data_reader = DictReader(
        mturk_labeled_file_handle, fieldnames=header, encoding='utf-8')
    # skip first
    mturk_labeled_data_reader.next()
    # Dictionary to count flags
    flag_count_on_tweets = {}
    for hit in mturk_labeled_data_reader:
        if hit["AssignmentStatus"] != "Approved":
            continue
        tweet_id = hit['Input.id']
        answer = hit['Answer.Q3Answer']
        if tweet_id not in flag_count_on_tweets:
            flag_count_on_tweets[tweet_id] = 0
        if answer != 'N/A':
            flag_count_on_tweets[tweet_id] += 1
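    # counter: histogram of how many tweets received 0-3 flags
    # (assumes three approved assignments per HIT)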
    counter = {0: 0, 1: 0, 2: 0, 3: 0}
    with codecs.open(line_separated_tweets_json_file_name, 'r',
                     'utf8') as line_separated_tweets_handle:
        with open(aml_training_dataset_filename,
                  'wb') as aml_training_dataset_handle:
            csv_writer = unicodecsv.writer(
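
For context, a hedged sketch of how the flag counts gathered above could feed
the trainingLabel column; the two-of-three majority threshold and the 0/1 label
values are illustrative assumptions, not taken from the original script.

# Illustrative only: derive a per-tweet label from the MTurk flag counts above.
labels = {}
for tweet_id, flags in flag_count_on_tweets.items():
    counter[flags] += 1          # assumes at most three approved assignments per HIT
    labels[tweet_id] = 1 if flags >= 2 else 0   # assumed majority-vote rule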