Example No. 1
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :type csv_path: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :type target_dataset: str

    :return: a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to
            at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file corresponds
    schema_tables = dict((
            t,
            dict((f['label'], f['datastore_id'])
                for f in get_table(t)['fields']))
        for t in dataset_types)
    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)
    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames

        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys()) and
                    len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    assert schema_cols is not None, '{0:s} does not match any dataset type {1}'.format(
        csv_path, dataset_types)

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()   # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            org = row_dict.pop('Org')
            if org_id not in records:
                if len(records.keys()):
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []

            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}
    yield records
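A minimal consumption sketch for the generator above: the loop below is an assumption about how the yielded batches would be used, not part of the original module, and it relies on csv_data_batch being importable from the same file.

def count_rows_per_org(csv_path, target_dataset='ati'):
    # Hedged usage sketch: tally how many rows each organization contributes.
    totals = {}
    for batch in csv_data_batch(csv_path, target_dataset):
        # Each yielded batch maps at most one org-id to at most BATCH_SIZE row dicts.
        for org_id, rows in batch.items():
            totals[org_id] = totals.get(org_id, 0) + len(rows)
    return totals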
Example No. 2
def get_csv_log_reader(csv_logs):
    if PY2:
        csv_stream = BytesIO(csv_logs)
        bom = csv_stream.read(len(BOM_UTF8))
        assert bom == BOM_UTF8, "Unexpected Procmon csv encoding"
        csv_reader = DictReader(csv_stream, encoding='utf-8')
    else:
        csv_stream = StringIO(csv_logs.decode('utf-8-sig'))
        csv_reader = DictReader(csv_stream)
    return csv_reader
Example No. 3
def get_csv_log_reader(csv_logs):
    if PY2:
        csv_logs_utf8 = csv_logs.encode(
            'utf-8')  # I only found a csv library that works for UTF-8
        csv_stream = BytesIO(csv_logs_utf8)
        csv_reader = DictReader(csv_stream, encoding='utf-8')
    else:
        csv_stream = StringIO(csv_logs)
        csv_reader = DictReader(csv_stream)
    return csv_reader
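A rough usage sketch for the Python 3 branch of the reader above; the file name and the 'Process Name' column are assumptions for illustration, not taken from the source.

# Hedged usage sketch: feed decoded Procmon CSV text to get_csv_log_reader().
with open('procmon_export.csv', encoding='utf-8-sig') as fh:
    csv_logs = fh.read()

for row in get_csv_log_reader(csv_logs):
    print(row.get('Process Name'))  # column name assumed, not confirmed by the source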
Example No. 4
def get_log_readers(csv_logs, pml_logs):
    pml_stream = BytesIO(pml_logs)
    pml_reader = ProcmonLogsReader(pml_stream)
    if PY2:
        csv_logs_utf8 = csv_logs.encode(
            'utf-8')  # I only found a csv library that works for UTF-8
        csv_stream = BytesIO(csv_logs_utf8)
        csv_reader = DictReader(csv_stream, encoding='utf-8')
    else:
        csv_stream = StringIO(csv_logs)
        csv_reader = DictReader(csv_stream)

    return csv_reader, pml_reader
Example No. 5
def read_csv_logs(csv_path):
    if PY2:
        with io.open(csv_path, "rb") as f:
            bom = f.read(len(BOM_UTF8))
            assert bom == BOM_UTF8, "Unexpected Procmon csv encoding"
            csv_reader = DictReader(f, encoding='utf-8')

            for _ in csv_reader:
                pass
    else:
        with open(csv_path, "r", encoding="utf-8-sig") as f:
            csv_reader = DictReader(f)
            for _ in csv_reader:
                pass
Example No. 6
    def load_public_schools(self):
        from unicodecsv import DictReader
        import dateutil.parser
        from ambry.util import lowercase_dict
        
        table_name = 'public_schools'
        
        p = self.partitions.new_partition(table=table_name)
        
        url = self.metadata.build.public_schools.url

        self.log("Dowloading {}".format(url))
        
        file_name = self.filesystem.download(url)

        self.log("Dowloading {} to {}".format(url, file_name))

        with open(file_name) as f:
            dr = DictReader(f, delimiter='\t', encoding='latin1')

            try: p.query("DELETE FROM {}".format(table_name))
            except: pass
        
            lr = self.init_log_rate(5000,table_name)
            with p.database.inserter(table_name, update_size=True) as ins:
                for i, row in enumerate(dr):  
                    row = lowercase_dict(row)
                    row['id'] = None
                    lr()
                   
                    ins.insert(row)

        return True
Example No. 7
    def _iter_csv(self, fp_raw):
        r = DictReader(fp_raw, delimiter=str(";"), encoding="cp1251")

        mapping = {
            "Найменування": 'name',
            "Скорочена назва": 'short_name',
            "Код ЄДРПОУ": 'edrpou',
            "Місцезнаходження": 'location',
            "ПІБ керівника": 'head',
            "Основний вид діяльності": 'company_profile',
            "Стан": 'status',
        }

        for i, chunk in enumerate(r):
            company = {}

            for k, v in chunk.items():
                if k.strip():
                    if mapping[k] == "edrpou" and v:
                        company[mapping[k]] = int(v)
                    else:
                        company[mapping[k]] = v

            company['founders'] = []
            company["last_update"] = self.timestamp
            company["file_revision"] = self.revision

            if i and i % 50000 == 0:
                logger.warning('Read {} companies from CSV feed'.format(i))

            yield company
Example No. 8
def get_all_officer_fingerprints():
    officer_csv_path = os.path.join(DATA_PATH, 'npo/npo_officers.csv')
    # check cache
    cache_path = _get_cache_path(officer_csv_path)
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            return FingerprintStorage.from_dict(json.loads(f.read()))

    fingerprints = FingerprintStorage()
    total = sum(1 for line in open(officer_csv_path))
    sys.stderr.write("\nMaking officer fingerprints...\n")

    with open(officer_csv_path) as f:
        reader = DictReader(f)
        for i, data in enumerate(reader):
            officer_id = data['officer_id'].strip()
            officer_name = data['officer_name'].strip()
            if not (officer_id and officer_name):
                continue
            fingerprints.put(officer_id, make_fingerprint(officer_name))
            sys.stderr.write("\r%d of %d" % (i + 1, total))
            sys.stderr.flush()

    # write to cache
    with open(cache_path, 'w') as f:
        f.write(json.dumps(fingerprints.to_dict()))

    sys.stderr.write("\nDone\n")
    return fingerprints
Example No. 9
def main():
    prs = argparse.ArgumentParser()

    prs.add_argument('--count', type=int, default=100)

    prs.add_argument('file', type=file)

    args = prs.parse_args()

    count = args.count
    assert count > 0
    path = os.path.abspath(args.file.name)
    root, ext = os.path.splitext(path)
    new_path = '%s_trimmed_%s%s' % (root, count, ext)

    reader = DictReader(open(path))
    new_entries = []
    for i in range(count):
        new_entries.append(next(reader))

    with open(new_path, 'w') as new_file:
        writer = DictWriter(new_file, reader.unicode_fieldnames)
        writer.writeheader()
        writer.writerows(new_entries)

    print open(new_path).read()
Example No. 10
def validate_geodataset_upload(uploaded_file):
    """Validate an uploaded file containing geodataset data

    Because we're using exclusively `TemporaryFileUploadHandler`s we'll always
    have a local file we can open and inspect.
    """

    # Check the extension. We do this instead of calling the
    # FileExtensionValidator because we don't want a bad extension to go any
    # further in this validator.
    extension = os.path.splitext(uploaded_file.name)[-1].lower()
    if extension != '.csv':
        raise ValidationError(
            'Improper file extension "{}". You must upload a CSV'.format(
                extension))

    # Validate the file by opening and inspecting it.
    with open(uploaded_file.temporary_file_path(), 'rb') as file_obj:
        # Start the reader. Handle bad CSVs
        try:
            reader = DictReader(file_obj, encoding='utf-8-sig')
        except csv.Error:
            raise ValidationError(
                'Error processing file. File may not be a valid CSV.')

        if reader.fieldnames[0] != 'ocd_id':
            raise ValidationError('First column must be named \'ocd_id\'')

        seen = []
        for i, fieldname in enumerate(reader.fieldnames):
            clean_field = slugify_header(fieldname)
            if clean_field == '':
                raise ValidationError(
                    u'Column {} header is empty or decodes to empty'.format(i))
            if clean_field in seen:
                raise ValidationError(
                    u'One or more duplicate headers. {}'.format(clean_field))
            seen.append(clean_field)

        ocd_ids = []
        for row in reader:
            ocd_ids.append(row['ocd_id'])

    if not ocd_ids:
        raise ValidationError('File must have at least one entry.')

    # Get all the OCD IDs that match in the database, turn it into a list
    # we can compare to.
    db_ocd_ids = LegislativeDistrict.objects.filter(
        ocd_id__in=list(set(ocd_ids))).values_list('ocd_id', flat=True)

    # Iterate through user-provided OCD IDs and see if they're in the list
    # that came from the database. If they're not, return an error on the
    # first instance.
    for i, ocd_id in enumerate(ocd_ids):
        if ocd_id not in db_ocd_ids:
            raise ValidationError(u'One or more OCD IDs not found. First '
                                  'found: Row: {row} ID: {id}'.format(
                                      row=(i + 2), id=ocd_id))
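One way the validator above might be wired up, sketched as a plain Django form; the form class and field name are hypothetical and only illustrate where the validator plugs in.

# Hypothetical Django form that runs validate_geodataset_upload on the uploaded file.
from django import forms

class GeodatasetUploadForm(forms.Form):
    data_file = forms.FileField(validators=[validate_geodataset_upload])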
Example No. 11
def gdocs_persons():
    resp = requests.get(PERSONS_CSV_URL, stream=True)
    resp.raise_for_status()
    reader = DictReader(resp.raw)
    for data in reader:
        if not data['Full Name']:
            continue
        yield data
Example No. 12
    def read_csv(self, csv_url):
        try:
            res = requests.get(csv_url, stream=True)
            res.raise_for_status()
        except requests.exceptions.RequestException as exc:
            log.error('Failed to open CSV [%s]: %s', csv_url, exc)
            return

        if res.encoding is None:
            res.encoding = 'utf-8'
        for row in DictReader(res.iter_lines(decode_unicode=True)):
            yield row
Example No. 13
    def crawl(self):
        logging.warn('starting asx crawl')
        res = requests.get(CSV_URL)
        header, body = res.content.split('\r\n\r\n', 1)
        sio = StringIO(body)
        logging.warn('about to start processing asx')
        for row in list(DictReader(sio)):
            row['source_info'] = header.strip()
            try:
                self.scrape_company(row)
            except Exception, e:
                log.exception(e)
Example No. 14
def load_countries(apps, schema_editor):
    country = apps.get_model("core", "Country")

    with open("core/dicts/countries.csv", "r") as fp:
        r = DictReader(fp)

        for l in r:
            country.objects.update_or_create(pk=l["Code"],
                                             iso2=l["Alpha 2"],
                                             iso3=l["Alpha 3"],
                                             name_ua=l["UA"],
                                             name_en=l["UK"])
Example No. 15
def scrape_csv(data):
    _, local_file = mkstemp()
    urllib.urlretrieve(data.get('source_url'), local_file)
    print 'CSV: %(source_url)s' % data
    rows = []
    with open(local_file, 'r') as fh:
        for row in DictReader(fh):
            row.update(data)
            # row['person_id'] = row.pop('id', None)
            # pprint(row)
            rows.append(row)
    return rows
Example No. 16
def simplerun(fl, stopwords=stopwords):
    results = {}
    fl = open(fl)
    rd = DictReader(fl, encoding='utf-8')
    result = [row for row in rd if row['AU'] != '']
    for item in result:
        item['TI'] = item['TI'].lower().replace('(book)', '')
    ks = classify(result, end=2020)
    f = freqdst(ks, stopwords=stopwords, leaveout=['book'])
    for k in ks:
        tf = termfreq(f, k)
        results[k] = tf
    return results
Example No. 17
    def iter_dataset(self, fp, filetype):
        if filetype == "json":
            for l in json.load(fp):
                yield l

        elif filetype == "jsonlines":
            for l in fp:
                yield json.loads(l)

        elif filetype == "csv":
            r = DictReader(fp)
            for l in r:
                yield l
Example No. 18
    def _build_templates(self):
        lc = LocalCKAN()
        output_files = {}
        next_row = {}
        output_counter = {}
        output_path = self.args[2:][-1]
        table = get_table(DATASET_TYPE)

        def close_write_file(org_id):
            book = output_files[org_id]
            if not book:
                return
            book.save(os.path.join(output_path,
                org_id + '-' + str(output_counter[org_id]) + '.xls'))
            output_files[org_id] = None

        def out_file(org_id):
            if org_id in output_files:
                next_row[org_id] += 1
                # need to start a new file?
                if next_row[org_id] > SPLIT_XLS_ROWS:
                    close_write_file(org_id)
                else:
                    return output_files[org_id], next_row[org_id]
            try:
                org = lc.action.organization_show(id=org_id, include_datasets=False)
            except NotFound:
                print 'org id', org_id, 'not found'
                output_files[org_id] = None
                next_row[org_id] = 0
                return None, None
            book = xls_template(DATASET_TYPE, org)
            output_files[org_id] = book
            output_counter[org_id] = output_counter.get(org_id, 0) + 1
            next_row[org_id] = len(book.get_sheet(0).get_rows())
            return book, next_row[org_id]

        def add_row(book, row, d):
            sheet = book.get_sheet(0)
            for i, f in enumerate(table['fields']):
                sheet.write(row, i, d[f['datastore_id']])

        for f in self.args[1:-1]:
            for d in DictReader(open(f, 'rb')):
                book, row = out_file(d['organization'])
                if not book:
                    continue
                add_row(book, row, d)

        for org_id in output_files:
            close_write_file(org_id)
Example No. 19
def iter_game(game):
    """Iterates through all phases of a game."""

    # raise appropriate exception if gamestate is empty
    if stat(game + ".gamestate").st_size == 0:
        raise ValueError("Game {} has empty gamestate, probably doesn't start "
                         "at the beginning.".format(game))

    if exists(game + ".press.tagged"):
        press_file = game + ".press.tagged"
    else:
        press_file = game + ".press"

    with open(game + ".gamestate", "rb") as f:
        game_state = list(DictReader(f))

    with open(press_file, "rb") as f:
        press = list(DictReader(f, encoding="latin1"))

    with open(game + ".results", "rb") as f:
        order_results = list(DictReader(f))

    nested_state = nested_by_phase(game_state, return_keys=False)
    nested_press = nested_by_phase(press, return_keys=False)
    nested_results, years, seasons, types = nested_by_phase(order_results)

    for year in years:
        for season in seasons:
            for phase_type in types:
                state = nested_state[year][season][phase_type]
                press = nested_press[year][season][phase_type]
                results = nested_results[year][season][phase_type]
                if any(len(k) for k in (state, press, results)):
                    yield (year, season, phase_type, state, press, results)
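A minimal loop over the generator above, assuming a game whose .gamestate, .press (or .press.tagged) and .results files sit next to the path given; the path and the report format are illustrative only.

# Hedged usage sketch: summarise each phase of a single game.
for year, season, phase_type, state, press, results in iter_game('games/game001'):
    print('{}-{} {}: {} press entries, {} order results'.format(
        year, season, phase_type, len(press), len(results)))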
Example No. 20
    def __init__(self, fname):
        self.all = []
        self.full = {}
        self.groups = []
        self.lt2opencorpora = {}

        with open(fname, "r") as fp:
            r = DictReader(fp)

            for tag in r:
                # lemma form column represents the set of tags that a wordform
                # should have to be treated as a lemma.
                tag["lemma form"] = [
                    _f for _f in map(str.strip, tag["lemma form"].split(","))
                    if _f
                ]

                tag["divide by"] = [
                    _f for _f in map(str.strip, tag["divide by"].split(","))
                    if _f
                ]

                # opencorpora tags column maps LT tags to OpenCorpora tags
                # when possible
                tag["opencorpora tags"] = (tag["opencorpora tags"]
                                           or tag["name"])

                # Helper mapping
                self.lt2opencorpora[tag["name"]] = tag["opencorpora tags"]

                # The parent column links a tag to its group tag.
                # For example, the parent tag for noun is the POST tag,
                # and the parent for m (masculine) is gndr (the gender group).
                if not hasattr(self, tag["parent"]):
                    setattr(self, tag["parent"], [])

                attr = getattr(self, tag["parent"])
                attr.append(tag["name"])

                # aux is our auxiliary tag to connect our group tags
                if tag["parent"] != "aux":
                    self.all.append(tag["name"])

                # We are storing order of groups that appears here to later
                # sort tags by their groups during export
                if tag["parent"] not in self.groups:
                    self.groups.append(tag["parent"])

                self.full[tag["name"]] = tag
Example No. 21
def as_table(file, limit=None):
    try:
        sio = StringIO(file.data)
        reader = DictReader(sio)
        data = {'headers': None, 'rows': [], 'total': 0}
        for i, row in enumerate(reader):
            if data['headers'] is None:
                data['headers'] = row.keys()
            if limit is None or i < limit:
                rd = [row.get(k) for k in data['headers']]
                data['rows'].append(rd)
            data['total'] = i
        return data
    except CSVError, e:
        return {'status': 'error', 'error': unicode(e)}
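A hedged usage sketch for as_table(): the small wrapper class below only exists to supply the .data attribute the function reads and is not part of the original code.

class StoredFile(object):
    # Hypothetical stand-in for whatever file object as_table() normally receives.
    def __init__(self, data):
        self.data = data

table = as_table(StoredFile('name,age\nAda,36\nLin,29\n'), limit=10)
# table['headers'] lists the column names, table['rows'] the cell values,
# and table['total'] holds the index of the last row read.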
Example No. 22
def csv(fh):
    """Read a CSV file and return an iterator of normalised rows."""
    for row in DictReader(fh):
        data = {}
        for k, v in row.items():
            key = slugify(k, sep='_')
            if key is None:
                continue
            v = v.strip()
            if not len(v):
                v = None
            if key in data:
                log.warning("Duplicate column: %s", key)
            data[key] = v
        yield data
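A short sketch of calling the normaliser above, assuming slugify(k, sep='_') turns a header such as 'First Name' into 'first_name'; the file and column names are assumptions.

# Hedged usage sketch: iterate normalised rows from an open file handle.
with open('people.csv') as fh:
    for data in csv(fh):
        print(data.get('first_name'), data.get('last_name'))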
Example No. 23
def mappings_import(file):
    """Load decided mappings from a CSV file."""
    for row in DictReader(file):
        left_uid = row.get('left')
        right_uid = row.get('right')
        judgement = parse_boolean(row.get('judgement'), default=None)
        score = None
        if judgement is None:
            left = Entity.get(left_uid)
            right = Entity.get(right_uid)
            score = left.compare(right)
        project.emit_judgement(left_uid,
                               right_uid,
                               judgement,
                               score=score,
                               decided=True)
Example No. 24
def scrape_csv(context, data):
    period = data.get("period")
    country = data.get("country")
    legislature = data.get("legislature")
    start_year = int(period.get('start_date')[:4])
    current_year = datetime.utcnow().year
    # Don't import the US 2nd continental congress (TM):
    if current_year - 10 > start_year:
        return
    res = context.http.get(period.get('csv_url'))
    with open(res.file_path, 'rb') as csvfile:
        for row in DictReader(csvfile):
            context.emit(data={
                "country": country,
                "legislature": legislature,
                "row": row
            })
Example No. 25
def simplecloud(fl, stopwords=stopwords):
    wordclouds = {}
    fl = open(fl)
    rd = DictReader(fl, encoding='utf-8')
    result = [row for row in rd if row['AU'] != '']
    for item in result:
        item['TI'] = item['TI'].lower().replace('(book)', '')
    ks = classify(result, end=2020)
    f = freqdst(ks, stopwords=stopwords)
    for k in ks:
        wordcloud = WordCloud(font_path='/Library/Fonts/Verdana.ttf',
                              relative_scaling=1.0,
                              stopwords=stopwords)
        tf = termfreq(f, k)
        wordcloud.generate_from_frequencies(tf)
        wordclouds[k] = wordcloud
    return wordclouds
Example No. 26
    def rows(self):
        try:
            with open(self.file_name, 'r') as fh:
                sample = fh.read(4096 * 10)
                encoding = guess_encoding(sample)
                if encoding != 'utf-8':
                    log.info("Decode [%s]: %s", self.file_name, encoding)
                sample = sample.decode(encoding, 'replace')
                dialect = Sniffer().sniff(sample)
                fh.seek(0)
                for row in DictReader(
                        fh,
                        encoding=encoding,
                        delimiter=dialect.delimiter.encode(encoding)):
                    yield row
        except Exception as exc:
            log.error('Failed reading file [%s]: %s', self.file_name, exc)
Example No. 27
    def unified_foreign_registry_import(self, request):
        if request.method == "GET":
            return render(
                request,
                "admin/core/company/unified_import.html",
                {"form": ForeignImportForm()},
            )
        if request.method == "POST":
            form = ForeignImportForm(request.POST, request.FILES)

            if not form.is_valid():
                return render(request,
                              "admin/core/company/unified_import.html",
                              {"form": form})

            created_records = 0
            updated_records = 0
            r = DictReader(request.FILES["csv"])
            importer = CompanyImporter(logger=MessagesLogger(request))
            conn_importer = Company2CountryImporter(
                logger=MessagesLogger(request))

            for entry in r:
                company, created = importer.get_or_create_from_unified_foreign_registry(
                    entry)

                if not company:
                    continue

                if created:
                    created_records += 1
                else:
                    updated_records += 1

                country_connection, _ = conn_importer.get_or_create(
                    company,
                    entry.get("country", "").strip(), "registered_in")

            self.message_user(
                request,
                # "Created %s companies, updated %s"
                "Створено %s компаній, оновлено %s" %
                (created_records, updated_records),
            )

            return redirect(reverse("admin:core_company_changelist"))
Example No. 28
def carregar_regioes(filename):
    from cidadeiluminada.base import db
    from cidadeiluminada.protocolos.models import Bairro, Regiao
    with open(filename, 'r') as csvfile:
        csvreader = DictReader(csvfile)
        for row in csvreader:
            regiao_ = row['regiao']
            regiao = Regiao.query.filter_by(nome=regiao_).first()
            if not regiao:
                regiao = Regiao(nome=regiao_)
                db.session.add(regiao)
            bairro_ = row['bairro']
            bairro = Bairro.query.filter_by(nome=bairro_).first()
            if not bairro:
                bairro = Bairro(nome=bairro_)
                db.session.add(bairro)
            bairro.regiao = regiao
    db.session.commit()
Example No. 29
    def _get_csv_reader(self, *args, **kwargs):
        """Guess CSV dialect, and return CSV reader."""
        # Skip the first line, as csv headers are more likely to have weird
        # character distributions than the actual data.
        self.csvfile.readline()

        # Read a significant chunk of the data to improve the odds of
        # determining the dialect.  MCM is often run on very wide csv files.
        dialect = Sniffer().sniff(self.csvfile.read(16384))
        self.csvfile.seek(0)

        if 'reader_type' not in kwargs:
            return DictReader(self.csvfile, errors='replace')

        else:
            reader_type = kwargs.get('reader_type')
            del kwargs['reader_type']
            return reader_type(self.csvfile, dialect, **kwargs)
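An illustrative call of the helper above, assuming self.csvfile was opened by the surrounding importer class; passing csv.reader through reader_type is an assumption about how the hook is meant to be used.

# Default call: a DictReader that replaces undecodable bytes (importer is hypothetical).
reader = importer._get_csv_reader()

# Hypothetical alternative: request a plain csv.reader built with the sniffed dialect.
import csv
rows = importer._get_csv_reader(reader_type=csv.reader)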
Example No. 30
def import_aliases(project, author, path):
    """ Import aliases from a CSV file. This will not create new entities, but
    re-name existing entities or merge two entities if one's name is given as 
    an alias for the other. """
    with open(path, 'r') as fh:
        reader = DictReader(fh)
        for i, row in enumerate(reader):
            data = {}
            for k, v in row.items():
                k = k.lower().strip()
                data[k] = v
            assert 'canonical' in data, 'No "canonical" column!'
            assert 'alias' in data, 'No "alias" column!'
            entities.apply_alias(project, author, data.get('canonical'),
                                 data.get('alias'))
            if i % 1000 == 0:
                db.session.commit()
        db.session.commit()
Example No. 31
    def run(self, filename, state):
        faker = Faker()
        with open(filename) as csvfile:
            # id, title, description, length, need_finance,
            # one_day, type, experience, attendees, size
            reader = DictReader(csvfile)
            count = 0
            for row in reader:
                if Proposal.query.filter_by(title=row['title']).first():
                    continue

                user = User('*****@*****.**' % count, faker.name())
                db.session.add(user)

                proposal = TalkProposal() if row['type'] == u'talk' else\
                    WorkshopProposal() if row['type'] == u'workshop' else\
                    InstallationProposal()

                proposal.state = state
                proposal.title = row['title']
                proposal.description = row['description']

                proposal.one_day = True if row.get('one_day') == 't' else False
                proposal.needs_money = True if row.get(
                    'need_finance') == 't' else False

                if row['type'] == 'talk':
                    proposal.length = row['length']

                elif row['type'] == 'workshop':
                    proposal.length = row['length']
                    proposal.attendees = row['attendees']

                else:
                    proposal.size = row['size']

                proposal.user = user
                db.session.add(proposal)

                db.session.commit()
                count += 1

        app.logger.info('Imported %s proposals' % count)
    "screen_name",
    "sid",
    "statuses_count",
    "text",
    "time_zone",
    "uid",
    "user.name",
    "utc_offset",
    "verified",
    "trainingLabel",
]

html_parser = HTMLParser.HTMLParser()

with open(mturk_labeled_filename, 'rb') as mturk_labeled_file_handle:
    mturk_labeled_data_reader = DictReader(
        mturk_labeled_file_handle, fieldnames=header, encoding='utf-8')
    # skip first
    mturk_labeled_data_reader.next()
    # Dictionary to count flags
    flag_count_on_tweets = {}
    for hit in mturk_labeled_data_reader:
        if hit["AssignmentStatus"] != "Approved":
            continue
        tweet_id = hit['Input.id']
        answer = hit['Answer.Q3Answer']
        if tweet_id not in flag_count_on_tweets:
            flag_count_on_tweets[tweet_id] = 0
        if answer != 'N/A':
            flag_count_on_tweets[tweet_id] += 1
    counter = {0: 0, 1: 0, 2: 0, 3: 0}
    with codecs.open(line_separated_tweets_json_file_name, 'r', 'utf8') as line_separated_tweets_handle: