Example No. 1
def try_dialects(inputfile, fieldnames, dialect):
    #FIXME: does it verify at all if we don't actually step through the file?
    try:
        inputfile.seek(0)
        reader = csv.DictReader(inputfile, fieldnames=fieldnames, dialect=dialect, restkey=EXTRA_KEY)
    except csv.Error:
        try:
            inputfile.seek(0)
            reader = csv.DictReader(inputfile, fieldnames=fieldnames, dialect='default', restkey=EXTRA_KEY)
        except csv.Error:
            inputfile.seek(0)
            reader = csv.DictReader(inputfile, fieldnames=fieldnames, dialect='excel', restkey=EXTRA_KEY)
    return reader
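As the FIXME suggests, csv.DictReader parses nothing at construction time, so the except branches above can never fire. A minimal sketch of an eager variant, assuming the same csv import and EXTRA_KEY constant (and that a 'default' dialect has been registered elsewhere, as the original implies), forces a read so csv.Error can actually surface:

def try_dialects_eager(inputfile, fieldnames, dialect):
    # Probe each candidate dialect by reading one row; construction alone
    # never raises csv.Error because parsing is lazy.
    for candidate in (dialect, 'default', 'excel'):
        inputfile.seek(0)
        probe = csv.DictReader(inputfile, fieldnames=fieldnames,
                               dialect=candidate, restkey=EXTRA_KEY)
        try:
            next(probe, None)  # parsing happens here
        except csv.Error:
            continue  # try the next candidate
        inputfile.seek(0)  # rewind so the caller sees the whole file
        return csv.DictReader(inputfile, fieldnames=fieldnames,
                              dialect=candidate, restkey=EXTRA_KEY)
    raise csv.Error('no candidate dialect could parse the file')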
Example No. 2
def readData(input_file,
             field_names,
             delimiter=',',
             prefix=None,
             configuration=None):
    """
    Read in our data from a CSV file and create a dictionary of records, 
    where the key is a unique record ID and each value is a dict 
    of the row fields.

    **Currently, dedupe depends upon records' unique ids being integers
    with no integers skipped. The smallest valued unique id must be 0 or
    1. Expect this requirement will likely be relaxed in the future.**
    """
    if not configuration:
        raise Exception("configuration argument is not really optional")

    data = {}

    reader = csv.DictReader(StringIO(input_file), delimiter=delimiter)
    for i, row in enumerate(reader):
        clean_row = {
            k: preProcess(k, v, configuration)
            for (k, v) in row.items() if k is not None
        }
        if prefix:
            row_id = u"%s|%s" % (prefix, i)
        else:
            row_id = i
        data[row_id] = clean_row

    return data
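A hypothetical call, for illustration only (preProcess and the expected shape of configuration are defined elsewhere in the source module; note the CSV contents are passed as a string, not a file handle):

csv_text = 'name,city\nAlice,Lisbon\nBob,Porto\n'
records = readData(csv_text, field_names=None, prefix='fileA',
                   configuration={'placeholder': 'whatever preProcess expects'})
# records maps 'fileA|0', 'fileA|1', ... to dicts of cleaned row fields;
# without a prefix the keys are the bare integers 0, 1, ...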
Example No. 3
def load_data(year):
    '''
    Load data into memory cache
    '''
    year = str(year)
    if year in CACHE:
        return True

    data_file = os.path.join(os.path.dirname(__file__), 'data',
                             '{}.csv'.format(year))
    if not os.path.isfile(data_file):
        return False

    CACHE[year] = {}
    with io.open(data_file, encoding='utf-8') as rf:
        # Detect CSV header line
        has_header = csv.Sniffer().has_header(rf.read(1024))
        rf.seek(0)

        reader = csv.DictReader(rf, DATA_FIELDS)
        if has_header:
            next(reader)

        for data_line in reader:
            day = clean_up_dict(data_line)
            # Convert into `int` type so we don't need to parse it afterwards
            dt = datetime.strptime(day['date'], '%Y-%m-%d')
            day['year'] = dt.year
            day['month'] = dt.month
            day['day'] = dt.day
            day['isholiday'] = bool(int(day['isholiday']))
            day['isworkday'] = bool(int(day['isworkday']))
            CACHE[year][day.pop('date')] = day

    return True
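Because DATA_FIELDS is passed explicitly, DictReader treats every line as data, which is why the sniffed header has to be skipped by hand with next(reader). A self-contained sketch of the same pattern:

import csv
import io

sample = io.StringIO('date,isholiday\n2024-01-01,1\n')
has_header = csv.Sniffer().has_header(sample.read(1024))  # heuristic guess
sample.seek(0)
reader = csv.DictReader(sample, ['date', 'isholiday'])  # explicit fieldnames
if has_header:
    next(reader)  # skip the header row ourselves; DictReader will not
for row in reader:
    print(row)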
Example No. 4
def readData(input_file, field_names, prefix=None):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID and each value is a dict
    of the row fields.

    **Currently, dedupe depends upon records' unique ids being integers
    with no integers skipped. The smallest valued unique id must be 0 or
    1. Expect this requirement will likely be relaxed in the future.**
    """

    data = {}

    reader = csv.DictReader(StringIO(input_file))
    timer.elapsed('Opened file')

    for i, row in enumerate(reader):
        clean_row = {
            k: preProcess(v)
            for (k, v) in row.items() if k is not None
        }
        if prefix:
            row_id = u"%s|%s" % (prefix, i)
        else:
            row_id = i
        data[row_id] = clean_row
        if i % 100000 == 0:
            timer.elapsed(str(i))

    return data
Example No. 5
def delete(api, start, date, r):
    with io.open("tweets.csv", encoding='utf-8') as file:
        count = 0

        for row in csv.DictReader(file):
            tweet_id = int(row["tweet_id"])
            tweet_date = parse(row["timestamp"], ignoretz=True).date()
            if start != "" and tweet_date < parse(start).date():
                continue
            if date != "" and tweet_date >= parse(date).date():
                continue

            if (r == "retweet" and row["retweeted_status_id"] == ""
                    or r == "reply" and row["in_reply_to_status_id"] == ""):
                continue

            try:
                print("Deleting tweet #{0} ({1})".format(tweet_id, tweet_date))

                api.DestroyStatus(tweet_id)
                count += 1
                time.sleep(0.2)

            except twitter.TwitterError as err:
                print("Exception: %s\n" % err.message)

    print("Number of deleted tweets: %s\n" % count)
Example No. 6
 def iter_records(self):
     info_path = os.path.join(self.path, 'data.csv')
     with io.open(info_path, encoding='utf8') as f:
         for row in csv.DictReader(f):
             if row['failed']:
                 continue
             yield row
Example No. 7
 def parse(self, input):
     """parsese the given file or file source string"""
     if hasattr(input, 'name'):
         self.filename = input.name
     elif not getattr(self, 'filename', ''):
         self.filename = ''
     if hasattr(input, "read"):
         tmsrc = input.read()
         input.close()
         input = tmsrc
     if TAB_UTF16 in input.split(b"\n")[0]:
         self.encoding = 'utf-16'
     else:
         self.encoding = 'iso-8859-1'
     try:
         input = input.decode(self.encoding)
     except Exception:
         raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
     reader = csv.DictReader(input.split("\n"), fieldnames=WF_FIELDNAMES, dialect="wordfast")
     for idx, line in enumerate(reader):
         if idx == 0:
             header = dict(zip(WF_FIELDNAMES_HEADER, [line[key] for key in WF_FIELDNAMES]))
             self.header = WordfastHeader(header)
             continue
         newunit = WordfastUnit()
         newunit.dict = line
         self.addunit(newunit)
Example No. 8
 def test_read_dict_no_fieldnames(self):
     with TemporaryFile("w+") as fileobj:
         fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
         fileobj.seek(0)
         reader = csv.DictReader(fileobj)
         self.assertEqual(next(reader), {"f1": '1', "f2": '2', "f3": 'abc'})
         self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
Example No. 9
 def _parseCSV(self, file):
     """read CSV from IO using DictReader"""
     i = 1
     reader = csv.DictReader(file)
     self._headers = list(reader.fieldnames)
     for row in reader:
         self._rows.append(row.copy())
         i += 1
Example No. 10
 def trac_fetcher(ticket):
     url = '%s/ticket/%s' % (base_url, ticket)
     response = requests.get(url + '?format=csv')
     if response.status_code == 200:
         reader = csv.DictReader(response.text.split('\n'))
         row = next(reader)
         return url, row.get('summary', None)
     else:
         return url, None
Example No. 11
def parse_csv(data):
    f = io.StringIO(data)
    reader = csv.DictReader(f, dialect="excel-tab", strict=True)
    result = [dict(row) for row in reader]
    for row in result:
        for k in row.keys():
            if not k.isalnum():  # Otherwise the reader would parse just about anything...
                raise csv.Error
    return result
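One caveat worth hedging: when a data row is longer than the header, DictReader files the extra values under the key None (the default restkey), and None.isalnum() raises AttributeError rather than csv.Error. A slightly more defensive variant of the same idea, assuming the same io and csv imports:

def parse_csv_checked(data):
    f = io.StringIO(data)
    reader = csv.DictReader(f, dialect='excel-tab', strict=True)
    result = [dict(row) for row in reader]
    for row in result:
        for k in row:
            # k is None when a row had more columns than the header
            if k is None or not k.isalnum():
                raise csv.Error('suspicious header or ragged row: %r' % (k,))
    return result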
Example No. 12
 def test_read_dict_fieldnames_chain(self):
     import itertools
     with TemporaryFile("w+") as fileobj:
         fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
         fileobj.seek(0)
         reader = csv.DictReader(fileobj)
         first = next(reader)
         for row in itertools.chain([first], reader):
             self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
             self.assertEqual(row, {"f1": '1', "f2": '2', "f3": 'abc'})
Example No. 13
 def test_read_long(self):
     with TemporaryFile("w+") as fileobj:
         fileobj.write("1,2,abc,4,5,6\r\n")
         fileobj.seek(0)
         reader = csv.DictReader(fileobj, fieldnames=["f1", "f2"])
         self.assertEqual(next(reader), {
             "f1": '1',
             "f2": '2',
             None: ["abc", "4", "5", "6"]
         })
Example No. 14
 def test_export(self):
     self.store.export()
     csv_fpath = self.store._CountDataStorage__csv_location
     with open(csv_fpath, newline='', encoding='utf-8') as rf:
         reader = csv.DictReader(rf)
         for row in reader:
             day = datetime.strptime(row['Date'], self.format)
             self.assertEqual(
                 self.store.get(day), int(row['Count']),
                 'Count of date {} should be {}'.format(
                     row['Date'], row['Count']))
Example No. 15
 def test_read_long_with_rest_no_fieldnames(self):
     with TemporaryFile("w+") as fileobj:
         fileobj.write("f1,f2\r\n1,2,abc,4,5,6\r\n")
         fileobj.seek(0)
         reader = csv.DictReader(fileobj, restkey="_rest")
         self.assertEqual(reader.fieldnames, ["f1", "f2"])
         self.assertEqual(next(reader), {
             "f1": '1',
             "f2": '2',
             "_rest": ["abc", "4", "5", "6"]
         })
Example No. 16
 def test_read_semi_sep(self):
     reader = csv.DictReader(["1;2;abc;4;5;6\r\n"],
                             fieldnames="1 2 3 4 5 6".split(),
                             delimiter=';')
     self.assertEqual(next(reader), {
         "1": '1',
         "2": '2',
         "3": 'abc',
         "4": '4',
         "5": '5',
         "6": '6'
     })
Example No. 17
def delete(csv_file, date, r):
    with io.open(csv_file, encoding='utf-8') as tweets_file:
        count = 0

        api = twitter.Api(consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
                          consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'],
                          access_token_key=os.environ['TWITTER_ACCESS_TOKEN'],
                          access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])
        destroyer = TweetDestroyer(api)

        for row in TweetReader(csv.DictReader(tweets_file), date, r).read():
            destroyer.destroy(row["tweet_id"])
            count += 1

        print("Number of deleted tweets: %s\n" % count)
Example No. 18
    def test_read_multi(self):
        sample = [
            '2147483648,43.0e12,17,abc,def\r\n',
            '147483648,43.0e2,17,abc,def\r\n', '47483648,43.0,170,abc,def\r\n'
        ]

        reader = csv.DictReader(sample, fieldnames="i1 float i2 s1 s2".split())
        self.assertEqual(
            next(reader), {
                "i1": '2147483648',
                "float": '43.0e12',
                "i2": '17',
                "s1": 'abc',
                "s2": 'def'
            })
Example No. 19
def load_csv(location):
    """
    Read CSV at `location`, return a list of ordered dictionaries, one
    for each row.
    """
    results = []
    # FIXME: why ignore encoding errors here?
    with codecs.open(location,
                     mode='rb',
                     encoding='utf-8-sig',
                     errors='ignore') as csvfile:
        for row in csv.DictReader(csvfile):
            # convert all the column keys to lower case
            updated_row = OrderedDict([(key.lower(), value)
                                       for key, value in row.items()])
            results.append(updated_row)
    return results
Example No. 20
def main():
    parser = argparse.ArgumentParser()
    org_files = os.environ.get("ORG", "~/org")
    parser.add_argument(
        "--contacts",
        "-c",
        default=os.path.join(org_files, "contacts.csv"),
        help="""The CSV file containing the contact details.""")
    parser.add_argument("--flag",
                        "-f",
                        action='store_true',
                        help="""Search by flag.""")
    parser.add_argument("names", nargs='*', help="""The names to look for.""")
    args = parser.parse_args()
    by_name = {}
    by_id = {}
    with io.open(args.contacts, 'r', encoding='utf-8') as input:
        contacts_reader = csv.DictReader(input)
        for row in contacts_reader:
            for multi in multi_fields:
                row[multi] = (row.get(multi, "") or "").split()
            n = make_name(row)
            row['_name_'] = n
            by_name[n] = row
            by_id[row.get('ID', "")] = row
    if args.flag != "":
        flag = args.names[0]
        by_address = {}
        for n in sorted(by_name.keys()):
            who = by_name[n]
            if flag in who['Flags']:
                addr = assemble_postal_address(who, "\n  ")
                if addr not in by_address:
                    by_address[addr] = []
                by_address[addr].append(who)
        for addr in sorted(by_address.keys()):
            sys.stdout.write(
                make_name_list(by_address[addr]) + "\n  " + addr + "\n\n")
    else:
        name = " ".join(args.names)
        if name in by_name:
            show_person(sys.stdout, by_name[name])
        else:
            for n in sorted(by_name.keys()):
                if name in n:
                    show_person(sys.stdout, by_name[n])
Example No. 21
def load_districts(csv_file):
    vbd = {}
    g = {}

    with io.open(csv_file, encoding='utf-8') as c:
        reader = csv.DictReader(c)
        for row in reader:
            vil = row[VILLAGE]
            dis = row[DISTRICT][6:]

            if dis in vbd.keys():
                vbd[dis] += [vil]
            else:
                vbd[dis] = [vil]

            g[vil + dis] = (row[LAT], row[LON])

    return vbd, g
Example No. 22
    def all(self, country, from_csv=None):

        file_handle = None

        # check for environment variable
        if not from_csv and 'OCD_DIVISION_CSV' in os.environ:
            from_csv = os.environ.get('OCD_DIVISION_CSV').format(country)
            try:
                file_handle = io.open(from_csv, encoding='utf8')
            except FileNotFoundError:
                raise ValueError("Unknown country in OCD ID")

        # going to the remote URL
        if not file_handle:
            file_handle = io.StringIO(
                urlopen(OCD_REMOTE_URL.format(country)).read().decode('utf-8'))

        for row in csv.DictReader(file_handle):
            yield Division(**row)
Example No. 23
    def all(self, country, from_csv=None):
        file_handle = None

        # Load from CSV if `from_csv` or `OCD_DIVISION_CSV` are set.
        if from_csv or 'OCD_DIVISION_CSV' in os.environ:
            if not from_csv:
                from_csv = os.environ.get('OCD_DIVISION_CSV').format(country)
            try:
                file_handle = io.open(from_csv, encoding='utf8')
            except FileNotFoundError:
                raise ValueError("Couldn't open CSV file {}".format(from_csv))

        # Load from URL otherwise.
        if not file_handle:
            file_handle = io.StringIO(
                urlopen(OCD_REMOTE_URL.format(country)).read().decode('utf-8'))

        for row in csv.DictReader(file_handle):
            yield Division(**row)
Example No. 24
def anonymize_file(source,
                   dest,
                   csvheaderformatdict=None,
                   ignorementions=False):
    print('Reading from [{0}] and writing anonymized data to [{1}]...'.format(
        source, dest))
    with io.open(source, 'r', encoding='utf8') as f:
        with io.open(dest, 'w', encoding='utf8') as o:
            reader = csv.DictReader(f)
            if not csvheaderformatdict:
                fieldnames = consts.defaultHeader
            else:
                fieldnames = reader.fieldnames
            writer = csv.DictWriter(o, fieldnames)
            writer.writeheader()
            for row in reader:
                anonymize_row(row, fieldnames, csvheaderformatdict,
                              ignorementions)
                writer.writerow(row)
Example No. 25
 def test_read_with_blanks(self):
     reader = csv.DictReader(
         ["1,2,abc,4,5,6\r\n", "\r\n", "1,2,abc,4,5,6\r\n"],
         fieldnames="1 2 3 4 5 6".split())
     self.assertEqual(next(reader), {
         "1": '1',
         "2": '2',
         "3": 'abc',
         "4": '4',
         "5": '5',
         "6": '6'
     })
     self.assertEqual(next(reader), {
         "1": '1',
         "2": '2',
         "3": 'abc',
         "4": '4',
         "5": '5',
         "6": '6'
     })
Example No. 26
def read_csv_database(database_path):
    """Read database CSV file, providing one line at a time.

    We'll use a class to modify the csv library's default dialect ('excel') to
    enable strict syntax checking.  This will trigger errors for things like
    unclosed quotes.
    """
    class StrictExcel(csv.excel):
        # Our helper class is really simple
        # pylint: disable=too-few-public-methods, missing-class-docstring
        strict = True

    with database_path.open(mode="r", encoding="utf-8") as database_file:
        reader = csv.DictReader(database_file, dialect=StrictExcel)
        try:
            for row in reader:
                yield row
        except csv.Error as err:
            raise MailmergeError("{}:{}: {}".format(database_path,
                                                    reader.line_num, err))
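A minimal illustration of what strict buys here: with strict=True an unterminated quoted field raises csv.Error ('unexpected end of data'), whereas the default lenient parser would silently keep the partial field:

import csv

class StrictExcel(csv.excel):
    strict = True

lines = ['name,note\n', 'alice,"unclosed quote\n']  # quote never closes
reader = csv.DictReader(lines, dialect=StrictExcel)
try:
    list(reader)
except csv.Error as err:
    print('rejected:', err)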
Example No. 27
def get_job_results(results_file, encoding="utf8"):
    """
    Read in results from a results file (results.csv.gz)
    Yields: dict
    """
    if not results_file.endswith(".csv.gz"):
        raise NotImplementedError(
            "Expecting results file to be a .csv.gz file.")

    mv_fields = None
    with gzip.open(results_file, "rt", encoding=encoding) as stream:
        for row in csv.DictReader(stream):
            # Remove __mv_ fields, replacing the original fields with lists of values where necessary
            if mv_fields is None:
                mv_fields = [(f, f[5:]) for f in row if f.startswith("__mv_")]
            if mv_fields:
                for mv_field, field in mv_fields:
                    if row[mv_field]:
                        row[field] = _decode_mv_field(row[mv_field])
                    del row[mv_field]
            yield row
Example No. 28
def people_to_qualtrics_csv(hub, repo_tools_data, frequency, update):
    """
    Print out a formatted file as expected by Qualtrics import.
    """

    if update is not None:
        with open(update, newline='', encoding='utf-8') as update_data:
            reader = csv.DictReader(update_data)
            initial = {row[EMAIL]: row for row in reader}
        fields = [field for field in reader.fieldnames if field]
    else:
        initial = {}
        fields = [NAME, EMAIL, WEEK, ASSOCIATED_WITH, UNSUBSCRIBED]

    csv_writer = csv.DictWriter(click.get_text_stream('stdout'),
                                fieldnames=fields,
                                extrasaction='ignore')
    csv_writer.writeheader()
    for username, person in repo_tools_data.people.items():
        if person.email is None:
            continue

        hashdigest = hashlib.md5(person.email.lower().encode('utf-8')).hexdigest()

        row = initial.get(person.email, {})
        row.update({
            NAME: person.name,
            EMAIL: person.email,
            WEEK: int(hashdigest, 16) % frequency + 1,
            ASSOCIATED_WITH: ('edX' if person.associated_with('edX', 'ArbiSoft')
                              else 'other'),
        })

        if not person.email_ok:
            row[UNSUBSCRIBED] = 'true'

        csv_writer.writerow(row)
Example No. 29
 def parse(self, input):
     """parsese the given file or file source string"""
     if hasattr(input, 'name'):
         self.filename = input.name
     elif not getattr(self, 'filename', ''):
         self.filename = ''
     if hasattr(input, "read"):
         tmsrc = input.read()
         input.close()
         input = tmsrc
     input = input.decode(self.encoding)
     try:
         header_length = self._read_header(input)
     except Exception:
         raise base.ParseError("Cannot parse header")
     lines = csv.DictReader(
         input.split(UtxDialect.lineterminator)[header_length:],
         fieldnames=self._fieldnames,
         dialect="utx")
     for line in lines:
         newunit = UtxUnit()
         newunit.dict = line
         self.addunit(newunit)
Example No. 30
 def parse(self, input):
     """parsese the given file or file source string"""
     if hasattr(input, 'name'):
         self.filename = input.name
     elif not getattr(self, 'filename', ''):
         self.filename = ''
     if hasattr(input, "read"):
         tmsrc = input.read()
         input.close()
         input = tmsrc
     try:
         input = input.decode(self.encoding)
     except Exception:
         raise ValueError(
             "OmegaT files are either UTF-8 encoded or use the default system encoding"
         )
     lines = csv.DictReader(input.split("\n"),
                            fieldnames=OMEGAT_FIELDNAMES,
                            dialect="omegat")
     for line in lines:
         newunit = OmegaTUnit()
         newunit.dict = line
         self.addunit(newunit)