Example #1
    def load_gtfs(self, gtfs_filename, tables=None, reporter=None, verbose=False):
        c = self.conn.cursor()

        if not os.path.isdir( gtfs_filename ):
            zf = ZipFile( gtfs_filename )

        for tablename, table_def in self.GTFS_DEF:
            filename = tablename[5:] + '.txt'

            if tables is not None and tablename not in tables:
                print( "skipping table %s - not included in 'tables' list" % tablename )
                continue

            print( "creating table %s\n"%tablename )
            create_table( c, tablename, table_def )
            print( "loading table %s\n"%tablename )

            try:
                if not os.path.isdir( gtfs_filename ):
                    trips_file = iterdecode( zf.read(filename).split("\n"), "utf-8" )
                else:
                    trips_file = iterdecode( open( os.path.join( gtfs_filename, filename ) ), "utf-8" )
                load_gtfs_table_to_sqlite(trips_file, tablename, c, table_def, verbose=verbose)
            except (KeyError, IOError):
                print( "NOTICE: GTFS feed has no file %s.txt, cannot load\n"%tablename )

        self._create_indices(c)
        self.conn.commit()
        c.close()
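For reference, a minimal self-contained sketch (with made-up in-memory data) of the pattern this loader relies on: codecs.iterdecode lazily turns an iterator of byte chunks into an iterator of decoded strings, which csv.reader can consume directly.

import codecs
import csv
import io

# Stand-in for zf.read(...).split("\n") or an open file handle: iterating a
# BytesIO yields one bytes line at a time.
byte_lines = io.BytesIO(b"stop_id,stop_name\n1,Main St\n2,Oak Ave\n")

# iterdecode decodes each bytes chunk to str with the requested codec, lazily.
decoded_lines = codecs.iterdecode(byte_lines, "utf-8")

for row in csv.reader(decoded_lines):
    print(row)  # ['stop_id', 'stop_name'], then ['1', 'Main St'], ['2', 'Oak Ave']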
Example #2
    def testIncrementalDecoder(self):

        # Tests derived from Python standard library test/test_codecs.py

        incremental_tests = (
            (u"python.org", b"python.org"),
            (u"python.org.", b"python.org."),
            (u"pyth\xf6n.org", b"xn--pythn-mua.org"),
            (u"pyth\xf6n.org.", b"xn--pythn-mua.org."),
        )

        for decoded, encoded in incremental_tests:
            if sys.version_info[0] == 2:
                self.assertEqual("".join(codecs.iterdecode(encoded, "idna")),
                                decoded)
            else:
                self.assertEqual("".join(codecs.iterdecode((bytes([c]) for c in encoded), "idna")),
                                decoded)

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), u"")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), u"")
        self.assertEqual(decoder.decode(b"", True), u"org")

        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), u"")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), u"\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), u"org.")
        self.assertEqual(decoder.decode(b"", True), u"")
Example #3
def convert_column(data, schemae):
    """Convert known types from primitive to rich."""
    ctype = schemae.converted_type
    if ctype == parquet_thrift.ConvertedType.DECIMAL:
        scale_factor = Decimal("10e-{}".format(schemae.scale))
        if schemae.type == parquet_thrift.Type.INT32 or schemae.type == parquet_thrift.Type.INT64:
            return [Decimal(unscaled) * scale_factor for unscaled in data]
        return [Decimal(intbig(unscaled)) * scale_factor for unscaled in data]
    elif ctype == parquet_thrift.ConvertedType.DATE:
        return [datetime.date.fromordinal(d) for d in data]
    elif ctype == parquet_thrift.ConvertedType.TIME_MILLIS:
        return [datetime.timedelta(milliseconds=d) for d in data]
    elif ctype == parquet_thrift.ConvertedType.TIMESTAMP_MILLIS:
        return [datetime.datetime.utcfromtimestamp(d / 1000.0) for d in data]
    elif ctype == parquet_thrift.ConvertedType.UTF8:
        return list(codecs.iterdecode(data, "utf-8"))
    elif ctype == parquet_thrift.ConvertedType.UINT_8:
        return _convert_unsigned(data, 'b')
    elif ctype == parquet_thrift.ConvertedType.UINT_16:
        return _convert_unsigned(data, 'h')
    elif ctype == parquet_thrift.ConvertedType.UINT_32:
        return _convert_unsigned(data, 'i')
    elif ctype == parquet_thrift.ConvertedType.UINT_64:
        return _convert_unsigned(data, 'q')
    elif ctype == parquet_thrift.ConvertedType.JSON:
        return [json.loads(s) for s in codecs.iterdecode(data, "utf-8")]
    elif ctype == parquet_thrift.ConvertedType.BSON and bson:
        return [bson.BSON(s).decode() for s in data]
    else:
        logger.info("Converted type '%s'' not handled",
                    parquet_thrift.ConvertedType._VALUES_TO_NAMES[ctype])  # pylint:disable=protected-access
    return data
Example #4
    def test_incremental_decode(self):
        self.assertEquals(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEquals(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEquals(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode(u"rg"), u"")
        self.assertEquals(decoder.decode(u"", True), u"org")

        decoder.reset()
        self.assertEquals(decoder.decode("xn--xam", ), u"")
        self.assertEquals(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
        self.assertEquals(decoder.decode("rg."), u"org.")
        self.assertEquals(decoder.decode("", True), u"")
Example #5
    def upsert_dataset(self, datasetCode):
        if datasetCode=='esri':
            url = self.url_amount[0]
            response = urllib.request.urlopen(url)
            draft = csv.reader(codecs.iterdecode(response, 'latin-1'), delimiter=',')
            included_cols = [0]
            list_csv = []
            year_draft = []
            for rrow in draft:
                list_csv.append(rrow)
                year_draft.append(list(rrow[i] for i in included_cols))

            # Generate the period index following the standard format
            year = year_draft[7:]
            year_last = year[-4][0][:4] + 'q4'
            period_index = pandas.period_range(year[0][0][:4], year_last, freq='quarterly')
            list_csv[5][0] = 'year'

            # flattens the tree structure
            for i, j in enumerate(list_csv[6]):
                if j != '':
                    if list_csv[5][i] != '':
                        keep = list_csv[5][i]
                        list_csv[5][i] = list_csv[5][i] + '_' + list_csv[6][i]          
                    else :
                        list_csv[5][i] = keep + '_' + list_csv[6][i]
            
            dimensionList_content = []  
            response = urllib.request.urlopen(url)
            reader = csv.DictReader(codecs.iterdecode(response, 'latin-1'), fieldnames=list_csv[5] ,delimiter=',')
            for i in range(len(reader.fieldnames)):
                if reader.fieldnames[i]!='' and reader.fieldnames[i] != 'year'  :
                    dimensionList_content.append(reader.fieldnames[i])
            
            dimensionList = {'content':dimensionList_content}
                    
            datasetCode = 'esri'
            releaseDates =response.getheaders()[0][1] 
            lastUpdate = datetime.datetime.strptime(releaseDates[5:], "%d %b %Y %H:%M:%S GMT")

            document = Dataset(provider = 'esri', 
                       name = year_draft[1][0] ,
                       datasetCode = 'esri', lastUpdate = lastUpdate,
                       dimensionList = dimensionList, 
                       docHref = "http://www.cao.go.jp/index-e.html") 
            effective_dimension_list = self.update_series('esri', dimensionList)
            document.update_database()
            document.update_es_database(effective_dimension_list)
        else:
            raise Exception("The name of dataset was not entered!")        
Example #6
 def get_rows(self, filename):
   if self.zf:
     try:
       contents = self.zf.read(filename)
     except KeyError:
       raise KeyError( "%s is not present feed"%filename )
     return csv.reader( iterdecode( contents.split("\n"), "utf-8" ) )
   else:
     return csv.reader( iterdecode( open( os.path.join( self.filename, filename ) ), "utf-8" ) )
Example #7
def vola_importer(url="https://raw.githubusercontent.com/flyingeek/editolido/gh-pages/ext-sources/vola_legacy_report.txt"):
    # https://oscar.wmo.int/oscar/vola/vola_legacy_report.txt
    if PY2:
        delimiter = b'\t'
        data = urlopen(url)
    else:
        delimiter = '\t'
        import codecs
        data = codecs.iterdecode(urlopen(url), 'utf-8')
    reader = csv.reader(data, delimiter=delimiter, quoting=csv.QUOTE_NONE)

    def geo_normalize(value):
        # recognize NSEW or undefined (which is interpreted as North)
        orientation = value[-1]
        sign = -1 if orientation in 'SW' else 1
        coords = value if orientation not in 'NEWS' else value[:-1]
        coords += ' 0 0'  # ensure missing seconds or minutes are 0
        degrees, minutes, seconds = map(float, coords.split(' ', 3)[:3])
        return sign * (degrees + (minutes / 60) + (seconds / 3600))

    headers = next(reader)
    for row in reader:
        name = row[5]
        if not name:
            continue
        yield name, geo_normalize(row[9]), geo_normalize(row[8]), row[28].split(', ')
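A worked example of the degrees/minutes/seconds conversion inside geo_normalize (the sample coordinate string is made up; real report values may differ):

value = "49 26 04N"                     # degrees minutes seconds + orientation
orientation = value[-1]
sign = -1 if orientation in 'SW' else 1
coords = value if orientation not in 'NEWS' else value[:-1]
coords += ' 0 0'                        # missing minutes/seconds become 0
degrees, minutes, seconds = map(float, coords.split(' ', 3)[:3])
print(sign * (degrees + (minutes / 60) + (seconds / 3600)))  # 49.4344...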
Example #8
 def __iter__(self):
     if self.is_zipped:
         byte_stream = BytesIO(self.response.content)
         with zipfile.ZipFile(byte_stream) as self.zipfile:
             for name in self.zipfile.namelist():
                 with self.zipfile.open(name) as single_file:
                     if name[-3:] == 'csv':
                         reader = csv.reader(single_file, delimiter=self.delimiter)
                     else:
                         reader = single_file
                     reader_iterator = iter(reader)
                     if self.is_header_present:
                         next(reader_iterator)
                     for line in reader_iterator:
                         yield self._parse_line(line)
         byte_stream.close()
     else:
         stream = codecs.iterdecode(self.response.iter_lines(),
                                    self.response.encoding or self.response.apparent_encoding)
         reader = csv.reader(stream, delimiter=self.delimiter)
         reader_iterator = iter(reader)
         if self.is_header_present:
             next(reader_iterator)
         for line in reader_iterator:
             yield self._parse_line(line)
         stream.close()
Example #9
 def unicode_csv_reader(self, file_handle, encoding='utf-8'):
     if encoding == 'utf-8':
         encoding_sig = 'utf-8-sig'
     else:
         encoding_sig = encoding
     reader = csv.reader([x.encode(encoding) for x in iterdecode(file_handle, encoding_sig)])
     for row in reader:
         yield [unicode(x, encoding) for x in row]
     return
Example #10
 def _handle_output(self, buffer_, hide, output, reader, indices):
     # Create a generator yielding stdout data.
     # NOTE: Typically, reading from any stdout/err (local, remote or
     # otherwise) can be thought of as "read until you get nothing back".
     # This is preferable over "wait until an out-of-band signal claims the
     # process is done running" because sometimes that signal will appear
     # before we've actually read all the data in the stream (i.e.: a race
     # condition).
     def get():
         while True:
             data = reader(self.read_chunk_size)
             if not data:
                 break
             yield self.encode(data)
     # Use that generator in iterdecode so it ends up in our local encoding.
     for data in codecs.iterdecode(
         get(), self.encoding, errors='replace'
     ):
         # Echo to local stdout if necessary
         # TODO: should we rephrase this as "if you want to hide, give me a
         # dummy output stream, e.g. something like /dev/null"? Otherwise, a
         # combo of 'hide=stdout' + 'here is an explicit out_stream' means
         # out_stream is never written to, and that seems...odd.
         if not hide:
             output.write(data)
             output.flush()
         # Store in shared buffer so main thread can do things with the
         # result after execution completes.
         # NOTE: this is threadsafe insofar as no reading occurs until after
         # the thread is join()'d.
         buffer_.append(data)
         # Run our specific buffer & indices through the autoresponder
         self.respond(buffer_, indices)
Example #11
    def io(self, reader, output, buffer_, hide):
        """
        Perform I/O (reading, capturing & writing).

        Specifically:

        * Read bytes from ``reader``, giving it some number of bytes to read at
          a time. (Typically this function is the result of `stdout_reader` or
          `stderr_reader`.)
        * Decode the bytes into a string according to ``self.encoding``
          (typically derived from `default_encoding` or runtime keyword args).
        * Save a copy of the bytes in ``buffer_``, typically a `list`, which
          the caller will expect to be mutated.
        * If ``hide`` is ``False``, write bytes to ``output``, a stream such as
          `sys.stdout`.
        """
        # Inner generator yielding read data
        def get():
            while True:
                data = reader(1000)
                if not data:
                    break
                # Sometimes os.read gives us bytes under Python 3...and
                # sometimes it doesn't. ¯\_(ツ)_/¯
                if not isinstance(data, six.binary_type):
                    # Can't use six.b because that just assumes latin-1 :(
                    data = data.encode(self.encoding)
                yield data
        # Decode stream using our generator & requested encoding
        for data in codecs.iterdecode(get(), self.encoding, errors='replace'):
            if not hide:
                output.write(data)
                output.flush()
            buffer_.append(data)
Example #12
 def stock(self, s):
     url = 'http://quote.yahoo.com/d/quotes.csv?s=%s&f=l1c1p2d1t1'
     u = urlopen(url % s)
     reader = csv.reader(codecs.iterdecode(u, 'utf-8')) # in python v3.x 'u' returns bytes that needs decoding
     res = reader.__next__() # the first/next item of the iterable
     u.close() # this closes 'u' and 'reader'
     return res
Example #13
def wmo_importer(url='https://raw.githubusercontent.com/flyingeek/editolido/gh-pages/ext-sources/nsd_bbsss.txt'):
    # http://tgftp.nws.noaa.gov/data/nsd_bbsss.txt
    if PY2:
        delimiter = b';'
        data = urlopen(url)
    else:
        delimiter = ';'
        import codecs
        data = codecs.iterdecode(urlopen(url), 'utf-8')
    reader = csv.reader(data, delimiter=delimiter, quoting=csv.QUOTE_NONE)

    def geo_normalize(value):
        # recognize NSEW or undefined (which is interpreted as North)
        orientation = value[-1]
        sign = -1 if orientation in 'SW' else 1
        coords = value if orientation not in 'NEWS' else value[:-1]
        coords += '-0-0'  # ensure missing seconds or minutes are 0
        degrees, minutes, seconds = map(float, coords.split('-', 3)[:3])
        return sign * (degrees + (minutes / 60) + (seconds / 3600))

    not_airport = '----'

    for row in reader:
        name = row[0] + row[1] if row[2] == not_airport else row[2]
        yield name, row[0] + row[1], geo_normalize(row[8]), geo_normalize(row[7])
Example #14
def query_nasdaq(self, exch_name):
    """Query Nasdaq for list of tickers by exchange"""
    header = {'user-agent': 'Mozilla/5.0 '\
              '(Macintosh; Intel Mac OS X 10.9; rv:32.0)'\
              ' Gecko/20100101 Firefox/32.0',}
    url = 'http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0'\
          '&exchange=%s&render=download' % (exch_name)
    req = Request(url, headers = header)
    try:
        response = urlopen(req)
    #Catch errors.
    except URLError as e:
        self.exchange_flag[0] = '1'
        if hasattr(e, 'reason'):
            return e.reason
        elif hasattr(e,'code'):
            return 'Error', e.code
    #Setup list(s) of exchange names.
    exch_result = csv.reader(iterdecode(response,'utf-8'))
    if exch_name == 'nasdaq':
        self.nasdaq_list = [row for row in exch_result]
    elif exch_name == 'nyse':
        self.nyse_list = [row for row in exch_result]
    elif exch_name == 'amex':
        self.amex_list = [row for row in exch_result]
    return 'Unknown Exception in query_nasdaq'
Example #15
 def _add_csv_file_to_db(self, decoder):
     f = codecs.iterdecode(
         self.upload_file_form.cleaned_data['marketing_file'],
         decoder
     )
     reader = csv.reader(f)
     if not self.uploaded_file:
         new_file = UploadedFile(
             filename=self.upload_file_form.cleaned_data['marketing_file'].name,
             uploaded_by=self.request.user,
             num_columns=0,
         )
         new_file.save()
         self.uploaded_file = new_file
     is_first_row = True
     self.num_cols = None
     row_number = 0
     for row in reader:
         if not self.num_cols:
             self.num_cols = len(row)
         if self._csv_row_is_not_blank(row):
             self._add_csv_row_to_db(row, is_first_row, row_number)
         is_first_row = False
         row_number += 1
     if self.num_cols:
         self.uploaded_file.num_columns = self.num_cols
         self.uploaded_file.save()
Example #16
    def buildDruidCache(self,cutoff_druid_score=0.2):
        druid_bz2 = bz2.BZ2File(self.druid_mwe_file, mode='r')
        druid_file = codecs.iterdecode(druid_bz2, 'utf-8')
        num_added_words=0

        for line in druid_file:
            split = line.split(u'\t')
            words = split[1].lower()
            druid_score = split[2]
            has_number = self.RE_D.search(words)
            #exlude any lines that have one or more numbers in them
            if not has_number:
                words_split = [filterHyphens(word) for word in words.split(u' ')]
                float_druid_score = float(druid_score)
                if float_druid_score > cutoff_druid_score:
                    if not any((word in self.stopwords) for word in words_split):
                        self.keyword_dict[words] = float_druid_score
                        num_added_words += 1
                        if num_added_words % 1000 == 0:
                            print words, self.keyword_dict[words]
                else:
                    break
        if self.extra_keywords != '':
            with codecs.open(self.extra_keywords) as infile:
                for line in infile:
                    words = line[:-1].lower()
                    print 'Loading user set keyword:',words
                    self.keyword_dict[words] = 3.0
Example #17
 def test_csv(self):
     reports = make(Report, _quantity=3)
     response = _export(reports, format="csv")
     reader = csv.DictReader(codecs.iterdecode(response, "utf8"))
     rows = list(reader)
     self.assertEqual(3, len(rows))
     self.assertEqual(rows[2]['Description'], reports[2].description)
Example #18
def fixed2csv(f, schema, output=None, **kwargs):
    """
    Convert a fixed-width file to csv using a CSV-formatted schema description.

    A schema CSV must start with a header row with (at least) columns labeled "column","start", and "length". (Other columns will be ignored.) For each subsequent row, therefore, those columns will be used to identify a column name, the starting index of the column (an integer), and the length of the column (also an integer).
    
    Values in the 'start' column are assumed to be zero-based, unless the first value for 'start' is 1, in which case all values are assumed to be one-based.

    If output is specified, rows will be written to that object, otherwise the complete data will be returned.
    """
    streaming = True if output else False

    if not streaming:
        output = StringIO()

    if 'encoding' in kwargs and kwargs['encoding']:
        f = iterdecode(f, kwargs['encoding'])
        
    writer = CSVKitWriter(output)

    reader = FixedWidthReader(f, schema)
    writer.writerows(reader)

    if not streaming:
        data = output.getvalue()
        return data
    
    # Return empty string when streaming
    return ''
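For illustration, a hypothetical schema and input matching the format the docstring describes (the column names and widths are invented); passing these two objects to fixed2csv should yield a two-column CSV, though exact whitespace handling depends on FixedWidthReader:

from io import StringIO

# Schema: one row per output column giving its name, zero-based start, and width.
schema = StringIO("column,start,length\n"
                  "name,0,10\n"
                  "age,10,3\n")

# A matching fixed-width record: "name" spans characters 0-9, "age" spans 10-12.
data = StringIO("Alice".ljust(10) + " 42\n")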
Example #19
    def __init__(self, f, schema, encoding=None):
        if encoding is not None:
            f = iterdecode(f, encoding)

        self.file = f
        self.parser = FixedWidthRowParser(schema)
        self.header = True
Example #20
def load_data_for_year(db, y):
	'''Loads data for the year from the CSV file in the current directory into the sqlite DB.'''
	print('Loading data for {y}'.format(y=y))
	
	db.execute('DROP TABLE IF EXISTS ucpay{y}'.format(y=y))
	# Some of these columns probably won't be needed, but we may as well store them.
	db.execute('''
		CREATE TABLE ucpay{y}
		(	ucpay_id INT PRIMARY KEY
		,	year INT
		,	campus TEXT
		,	name TEXT
		,	job_title TEXT
		,	gross_pay NUMERIC
		,	base_pay NUMERIC
		,	overtime_pay NUMERIC
		,	extra_pay NUMERIC
		)
	'''.format(y=y))
	inner_filename = 'ucpay.csv' if y != 2010 else 'ucpay2010.csv'
	datafile = iterdecode(ZipFile('./ucpay{y}.csv.zip'.format(y=y), mode='r').open(inner_filename, mode='r'), 'utf8')
	data = csv.reader(datafile, dialect=(csv.excel_tab if y == 2010 else csv.excel))

	if y != 2009:
		print(next(data))	# skip header
	
	for e in data:
		assert len(e) == len(["ID","year","campus","name","title","gross","base","overtime","extra","exclude"])
		# Skip tuples with "exclude"=="1", which are grad students and temporary employees rather than research professors
		if e[-1] != '1':
			assert e[-1] == '0'
			db.execute('INSERT INTO ucpay{y} VALUES (?,?,?,?,?,?,?,?,?)'.format(y=y), e[:-1])
	db.commit()
	
	print('	done')
Example #21
def populateStep(csvfile):

    reader = csv.DictReader(codecs.iterdecode(csvfile, 'utf-8'), delimiter=',', quotechar='"')
    for row in reader:
        quoi = row['Quoi']
        produit = row['Nom du produit']
        ech = row['echantillon']
        if ech == '':
            ech = 0.0
        else:
            ech = float(ech)
        manip = row['manip']
        if manip == '':
            manip = 0.0
        else:
            manip = float(manip)

        manip_object, created_manip = Manip.objects.get_or_create(name=quoi)
        product, product_created = Product.objects.get_or_create(name=produit)
        step, step_created = StepProto.objects.get_or_create(
            manip=manip_object,
            product=product,
            reac_by_sample=ech,
            reac_by_manip=manip
            )
Example #22
def parse_binary(fd):
    """Detect encoding of binary file fd and yield all chunks, encoded."""

    def find_header():
        rbuf = ReadBuffer(fd)
        parser = parse_encoded(rbuf)
        for msg in parser:
            if msg.msgid == '':
                charset, headers = parse_header_data(msg.msgstrs[0])
                return charset, rbuf.bytelines
        raise PoError('no-header',
                      'No header found in file %s' % getfilename(fd))

    # Non-strict parsing to find header and extract charset:
    charset, lines = find_header()

    parser = parse_encoded(iterdecode(itertools.chain(lines, fd),
                                      encoding=charset))

    # Always yield header first.  We buffer the messages (again) until
    # we find the header, yield the header, then those in the buffer
    msgs = []

    for msg in parser:
        msgs.append(msg)
        if msg.msgid == '':
            break

    yield msgs.pop()
    for msg in msgs:
        yield msg
    for msg in parser:
        yield msg
Example #23
def scrape(file_obj=None, include_header=True):
    """
    Download the source CSV data from Google Spreadsheets, convert it from
    wide-form to long-form, and output it to a file-like object (stdout by
    default).

    Args:
        file_obj: file-like object in which to output the parsed CSV data
        include_header: if True (default), include a header row in the output
    """
    if file_obj is None:
        file_obj = sys.stdout
    output = csv.writer(file_obj)
    if include_header:
        output.writerow(("date", "pollster", "party", "support"))

    response = urllib.request.urlopen(EXPORT_URL.format(SPREADSHEET_ID))
    # Since urllib returns bytes, iterate through the CSV data and decode it as
    # UTF-8.
    rows = csv.reader(codecs.iterdecode(response, "UTF-8"))
    # Party names are in the first row, from the second column onwards.
    parties = next(rows)[1:]
    for row in rows:
        # Dates are in the first column, from the second row onwards.
        date = datetime.datetime.strptime(row[0], "%d/%m/%Y").date()
        parties_support = []
        for party, support in zip(parties, row[1:]):
            try:
                output.writerow((date, "MMR", party,
                                 float(support.replace(",", "."))))
            except ValueError:
                # No value given for the party, and so no need to output a row.
                pass
Example #24
 def carga(self,file):
     
     with open(file,'rb') as csvfile:
        spamreader = csv.reader(codecs.iterdecode(csvfile,'latin1'), delimiter=';', quotechar='|')
        for row in spamreader:
          if len(row) > 0 and len(row[0]) > 0 and row[0].upper() != "CNPJ":
            self.dados.append(row)
Example #25
def _read_data():
    '''Read US region/state boundary data from the bundled gzipped CSV into a dict keyed by state code.'''
    nan = float('NaN')

    data = {}

    with gzip.open(package_path('US_Regions_State_Boundaries.csv.gz')) as f:
        decoded = codecs.iterdecode(f, "utf-8")
        next(decoded)
        reader = csv.reader(decoded, delimiter=str(','), quotechar=str('"'))
        for row in reader:
            region, name, code, geometry, dummy = row
            xml = et.fromstring(geometry)
            lats = []
            lons = []
            for i, poly in enumerate(xml.findall('.//outerBoundaryIs/LinearRing/coordinates')):
                if i > 0:
                    lats.append(nan)
                    lons.append(nan)
                coords = (c.split(',')[:2] for c in poly.text.split())
                lat, lon = list(zip(*[(float(lat), float(lon)) for lon, lat in
                    coords]))
                lats.extend(lat)
                lons.extend(lon)
            data[code] = {
                'name'   : name,
                'region' : region,
                'lats'   : lats,
                'lons'   : lons,
            }

    return data
Example #26
    def build_druid_cache(self, cutoff_druid_score):
        druid_bz2 = bz2.BZ2File(self.druid_mwe_file, mode='r')
        druid_file = codecs.iterdecode(druid_bz2, 'utf-8')
        num_added_words = 0

        logger.info("Loading DRUID cache...")
        start_time = time.time()

        for line in druid_file:
            split = line.split(u'\t')
            words = split[1].lower()
            druid_score = split[2]
            has_number = self.RE_D.search(words)
            # exclude any lines that have one or more numbers in them
            if not has_number:
                words_split = [filter_hyphens(word) for word in words.split(u' ')]
                float_druid_score = float(druid_score)
                if float_druid_score < cutoff_druid_score:
                    break

                if not any((word in self.stopwords) for word in words_split):
                    self.keyword_dict[words] = float_druid_score
                    num_added_words += 1
                    if num_added_words % 1000 == 0:
                        print(words, self.keyword_dict[words])

        logger.info("Finished loading DRUID cache. Time needed: " + str(time.time() - start_time))
Example #27
def scrape(file_obj=None, include_header=True):
    """
    Download the source data as a CSV using DataMarket's API, clean it up and
    discard uninteresting rows, and output it to a file-like object (stdout by
    default).

    Args:
        file_obj: file-like object in which to output the parsed CSV data
        include_header: if True (default), include a header row in the output
    """
    if file_obj is None:
        file_obj = sys.stdout
    output = csv.writer(file_obj)
    if include_header:
        output.writerow(("date", "pollster", "party", "support"))

    response = urllib.request.urlopen(SOURCE_URL)
    # Since urllib returns bytes, iterate through the CSV data and decode it as
    # UTF-8.
    rows = csv.reader(codecs.iterdecode(response, "ISO-8859-1"))
    for party, date, support in rows:
        # Only match rows that contain data for distinct political parties.
        # That way we can ignore rows containing data on government support
        # (e.g. "1995-2007 (B og D)").
        match = PARTY_NAME_RE.match(party)
        if match:
            # Dates are only year-and-month. Normalise it to a full date,
            # although that won't actually be an accurate date.
            date = datetime.datetime.strptime(date, "%Y-%m").date()
            # Use the first non-null group in the regex as the party name.
            party = next(g for g in match.groups() if g is not None)
            output.writerow((date, "Gallup", party, float(support)))
Example #28
def open_file(infile, informat='raw', encoding="utf-8", **kwargs):
    logger.debug('Opening file: {}'.format(infile))
    if isinstance(infile, basestring):
        if informat == "vnd.ms-excel" or informat == 'xls':
            import xlrd
            logger.debug('An office file!')
            f = xlrd.open_workbook(infile, on_demand=True)
        elif informat == "xml":
            logger.debug('An XML file!')
            f = etree.parse(infile)
        elif informat == "csv":
            logger.debug('Opening as csv')
            f = csv.reader(open(infile, 'r'),
                           encoding=encoding,
                           **kwargs)
        else:
            f = codecs.open(infile, 'r', encoding)
    else:
        if informat == "vnd.ms-excel" or informat == 'xls':
            import xlrd
            logger.debug('An office file!')
            f = xlrd.open_workbook(file_contents=infile.read(), on_demand=True)
        elif informat == "xml":
            logger.debug('An XML file!')
            f = etree.fromstring(infile)
        elif informat == "csv":
            logger.debug("CSV file")
            f = csv.reader(infile, encoding=encoding, **kwargs)
        else:
            f = codecs.iterdecode(iter(infile.readline, ""), encoding)
    return f
Example #29
def parse_data(fname):
    data = {}

    csvfile = list(csv.reader(codecs.iterdecode(urlopen(fname), 'utf-8')))
    csvfile.pop(0)  # remove 'date;area;concentration;volume' string
    csvfile.pop(len(csvfile) - 1)  # remove empty string
    for row in csvfile:
        row_data = row[0].split(';')
        raw_date = re.match('(.*)T', row_data[0]).group(1)
        date = format_date(raw_date)

        prepare.append_to_data(data, date['year'], date['month'], date['day'], {})

        area = float(row_data[1])
        conc = 0.0
        vol = 0.0
        if len(row_data) > 2:
            conc = float(row_data[2])
            vol = float(row_data[3])

        data[date['year']][date['month']][date['day']]['conc'] = conc
        data[date['year']][date['month']][date['day']]['vol'] = vol
        data[date['year']][date['month']][date['day']]['area'] = area

    return data
Example #30
 def forex(self, s='usd', t='eur'):
     url = 'http://quote.yahoo.com/d/quotes.csv?s=%s%s=X&f=nl1d1t1'
     u = urlopen(url % (s, t))
     reader = csv.reader(codecs.iterdecode(u, 'utf-8')) # in python v3.x 'u' returns bytes that needs decoding
     res = reader.__next__()  # the first/next item of the iterable
     u.close() # this closes 'u' and 'reader'
     return res
Example #31
def create_canada_population_table(category="VALUE"):
    """Creates a csv file that contains a table for all regions and the selected category (header) for the most recent date

    :param category: Header name in CSV file, defaults to "VALUE"
    :type category: str, optional
    """
    response = urlopen(get_csv_data())
    csv_reader = csv.reader(codecs.iterdecode(response, 'utf-8'),
                            delimiter=',')
    f = open(FILE_PATH + 'canada-population-data.csv', 'w', newline='')
    writer = csv.writer(f)

    line_count = 0
    desired_row = 0
    most_recent_values = {}  # Create a dictionary out of all the regions
    for region in selection_list:
        most_recent_values[region] = 0

    most_recent_date = "0"  # Gets the most recent date in the data

    for row in csv_reader:
        if line_count == 0:
            writer.writerow(['Region', 'Population', 'Date'])
            for index, header in enumerate(row):
                if (header == category):
                    desired_row = index
        elif len(row) > 0 and row[1] in selection_list:  # prename
            most_recent_date = row[0]  # report_date
            most_recent_values[row[1]] = [row[desired_row],
                                          most_recent_date]  # numtotal

        line_count += 1

    for reg in most_recent_values:
        writer.writerow(
            [reg, most_recent_values[reg][0],
             most_recent_values[reg][1]])  # name, total, date

    if (most_recent_date !=
            "0"):  # Update the posts file with the most recent dated update
        update_date(most_recent_date)

    f.close()
Example #32
 def _execute(self, cmd, params=None, data=None, headers={}, method=None):
     """execute a tomcat command and check status returning a file obj for further processing
 
 fobj = _execute(url)
 
 """
     url = self.__managerURL + "/" + cmd
     if params:
         url = url + "?%s" % urllib.parse.urlencode(params)
     req = ExtendedRequest(url, data, headers)
     if method:
         req.method = method
     response = self.__opener.open(req)
     content = codecs.iterdecode(response, "utf-8")
     status = next(content).rstrip()
     self.hasConnected = True
     if not status[:4] == "OK -":
         raise TomcatException(status)
     return content
Example #33
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza._compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)

        with closing(urlopen(url)) as response:
            f = response.fp
            encoding = get_response_encoding(response, 'utf-8')
            decoded = iterdecode(f, encoding)
            sliced = betwix(decoded, objconf.start, objconf.end, True)
            content = '\n'.join(sliced)

        parsed = get_string(content, objconf.start, objconf.end)
        detagged = get_text(parsed) if objconf.detag else parsed
        splits = detagged.split(objconf.token) if objconf.token else [detagged]
        stream = ({kwargs['assign']: chunk} for chunk in splits)

    return stream
Example #34
def load_data(file, group, tsv_file, type='events'):
    """Load downloaded event summary data into PyTables file.

    If you've previously downloaded event summary data from
    http://data.hisparc.nl/ in TSV format, you can load them into a PyTables
    file using this method. The result is equal to directly downloading data
    using :func:`download_data`.

    :param file: the PyTables datafile handler.
    :param group: the PyTables destination group, which need not exist.
    :param tsv_file: path to the tsv file downloaded from the HiSPARC
                     Public Database.
    :param type: the datatype to load, either 'events', 'weather',
                 'singles' or 'lightning'.

    Example::

        >>> import tables
        >>> import sapphire.esd
        >>> data = tables.open_file('data.h5', 'w')
        >>> sapphire.esd.load_data(data, '/s501', 'events-s501-20130910.tsv')

    """
    if type == 'events':
        table = _get_or_create_events_table(file, group)
        read_and_store_class = _read_line_and_store_event_class
    elif type == 'weather':
        table = _get_or_create_weather_table(file, group)
        read_and_store_class = _read_line_and_store_weather_class
    elif type == 'singles':
        table = _get_or_create_singles_table(file, group)
        read_and_store_class = _read_line_and_store_singles_class
    elif type == 'lightning':
        table = _get_or_create_lightning_table(file, group)
        read_and_store_class = _read_line_and_store_lightning_class
    else:
        raise ValueError("Data type not recognized.")

    with open(tsv_file, 'rb') as data:
        reader = csv.reader(iterdecode(data, 'utf-8'), delimiter='\t')
        with read_and_store_class(table) as writer:
            for line in reader:
                writer.store_line(line)
Example #35
def read_data(url, method):

    # Method one: Pandas read_csv() function
    if method == 1:
        print(
            "\nLoading in data via the read_csv method in pandas using a url\n"
        )
        data = pd.read_csv(url, names=columns)

    # Method two: use Numpy's loadtxt() method
    # Make sure to specify the delimiter, otherwise it throws an error
    if method == 2:
        print(
            "\nLoading in data via the loadtxt method in numpy using a url\n")
        data = np.loadtxt(url, dtype=float, delimiter=',')

    # Method three: Load in using csv.reader() function
    # Since the original method can only read physical csvs
    # We need to include the urllib.request.urlopen() method for the url
    # THEN, to properly load the data, we need to create a generator object using codecs
    # Then convert to a list, THEN to a DataFrame in pandas. Having fun yet?
    if method == 3:
        print("\nLoading in data via the csv.reader() method using a url\n")
        response = urllib.request.urlopen(url)
        data = csv.reader(codecs.iterdecode(response, 'utf-8'))
        data = pd.DataFrame(list(data), columns=columns)

    print("\nHere is the shape of the data for Method %s:\n" % (method),
          data.shape)

    # For methods 1 and 3
    try:
        print("\nHere is the head of the data for Method %s:\n" % (method),
              data.head(10))
        print("\nHere is the tail of the data for Method %s:\n" % (method),
              data.tail(10))

    # For method 2
    except AttributeError:
        print("\nHere is the head of the data for Method %s:\n" % (method),
              data[0:9, ])
        print("\nHere is the tail of the data for Method %s:\n" % (method),
              data[0:9, ])
Example #36
    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using a incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the rest method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )
Example #37
 def order_csv(self):
     try:
         stream = codecs.iterdecode(self.stream, 'utf-8')
         row_count = 0
         for row in csv.reader(stream, dialect=csv.excel):
             if row_count == 0:
                 row_count += 1
             else:
                 id = row[0]
                 purchase_date = row[1]
                 total_price = row[2]
                 data = OrderCSV(id=int(id),
                                 purchase_date=purchase_date,
                                 total_price=int(total_price))
                 db.session.add(data)
                 db.session.commit()
         return 'CSV data populated into database successfully'
     except Exception:
         return 'Some Error Occurred'
Example #38
    def __init__(self, wb_string, logger_obj=None):
        self.logger = logger_obj
        self.log(u'Initializing a TableauWorkbook object')
        self.wb_string = wb_string
        if self.wb_string.find('.twb') != -1:
            self.log(
                u".twb found in wb_string, assuming it is actually a filename. Opening file"
            )
            fh = open(self.wb_string, 'rb')
            self.wb_string = fh.read()
        self.wb = StringIO(self.wb_string)
        self.start_xml = ""
        self.end_xml = ""
        self.datasources = {}
        start_flag = True
        ds_flag = False
        current_ds = ""

        if self.logger is not None:
            self.enable_logging(self.logger)

        for line in codecs.iterdecode(self.wb, 'utf-8'):
            # Start parsing the datasources
            if start_flag is True and ds_flag is False:
                self.start_xml += line
            if start_flag is False and ds_flag is False:
                self.end_xml += line
            if ds_flag is True:
                current_ds += line
                # Break and load the datasource
                if line.find(u"</datasource>") != -1:
                    self.log(u"Building TableauDatasource object")
                    ds_obj = TableauDatasource(current_ds,
                                               logger_obj=self.logger)
                    self.datasources[ds_obj.get_datasource_name()] = ds_obj
                    current_ds = ""
            if line.find(u"<datasources") != -1 and start_flag is True:
                start_flag = False
                ds_flag = True

            if line.find(u"</datasources>") != -1 and ds_flag is True:
                self.end_xml += line
                ds_flag = False
Example #39
    def execute(self, cache=True):
        '''Retrieve a world bank indicator and convert to a data package.

        Data Package is stored at ./indicators/{indicator-name}
        '''
        if cache:
            self.retrieve()
            (meta, data) = self.extract(open(self.meta_dest),
                                        open(self.data_dest))
        else:
            (meta, data) = self.extract(
                urllib.request.urlopen(self.meta_url),
                codecs.iterdecode(urllib.request.urlopen(self.data_url),
                                  'utf-8'))

        basepath = os.path.join('indicators', meta['name'])
        os.makedirs(basepath, exist_ok=True)
        self.datapackage(meta, data, basepath)
        return basepath
Example #40
def get_aadf_by_direction_data(local_authority_id):
    """
    Get a DictReader of the specified AADF By Direction dataset.
    """

    # URL likely to change now and again. No guarantee the URL will remain
    # easily constructable, so not much point moving to config.
    url = f"https://dft-statistics.s3.amazonaws.com/road-traffic/downloads/aadfbydirection/local_authority_id/dft_aadfbydirection_local_authority_id_{local_authority_id}.csv"

    # Deliberately not handling any errors here. Let it crash and inspect
    # manually.
    csv_stream = urlopen(url)

    # CSV files tend to be a few MB, so use a generator (codecs.iterdecode) to
    # stream the CSV data and make the read process a bit more memory
    # efficient.
    csv_file = csv.DictReader(codecs.iterdecode(csv_stream, "utf-8"))

    return csv_file
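A hedged usage sketch: the returned DictReader is lazy, so rows are downloaded, decoded and parsed only as you iterate (the local_authority_id below is made up).

reader = get_aadf_by_direction_data(71)   # hypothetical local authority id
for i, row in enumerate(reader):
    print(row)        # dict keyed by the CSV header fields
    if i >= 2:        # peek at the first few rows only
        break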
Example #41
def create_preferences(file):
    shipback = []
    r = DictReader(codecs.iterdecode(file, 'utf-8'))
    for row in r:

        obj_1_id = -1
        if row['object_1_content_type'] == 'course':
            obj_1_id = Course.objects.get(name=row['object_1_natural_id']).id
        elif row['object_1_content_type'] == 'teacher':
            obj_1_id = BaseUser.objects.get(
                email=row['object_1_natural_id']).id
            row['object_1_content_type'] = 'baseuser'  # so we get the appropriate content type later
        elif row['object_1_content_type'] == 'timeblock':
            obj_1_id = Timeblock.objects.get(
                block_id=row['object_1_natural_id']).id
        elif row['object_1_content_type'] == 'section':
            obj_1_id = Section.objects.get(
                section_id=row['object_1_natural_id']).id

        obj_2_id = -1
        if row['object_2_content_type'] == 'course':
            obj_2_id = Course.objects.get(name=row['object_2_natural_id']).id
        elif row['object_2_content_type'] == 'teacher':
            obj_2_id = BaseUser.objects.get(
                email=row['object_2_natural_id']).id
            row['object_2_content_type'] = 'baseuser'  # so we get the appropriate content type later
        elif row['object_2_content_type'] == 'timeblock':
            obj_2_id = Timeblock.objects.get(
                block_id=row['object_2_natural_id']).id
        elif row['object_2_content_type'] == 'section':
            obj_2_id = Section.objects.get(
                section_id=row['object_2_natural_id']).id

        shipback.append(
            Preference(weight=row["weight"],
                       object_1_content_type=ContentType.objects.filter(
                           model=row['object_1_content_type'])[0],
                       object_2_content_type=ContentType.objects.filter(
                           model=row['object_2_content_type'])[0],
                       object_1_id=obj_1_id,
                       object_2_id=obj_2_id))
    return shipback
Example #42
def get_canton_data(canton='ZH'):
    # This CSV is messy. There are very many missing fields, because each canton
    # reports different data, and each new row is populated in increments (eg.
    # they might know how many are in quarantine that day in the morning, but
    # might not take hospital counts until the evening, etc.).
    csv_url = f'https://github.com/openZH/covid_19/raw/master/fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_Kanton_{canton}_total.csv'
    csv_lines = requests.get(csv_url).iter_lines()
    reader = csv.reader(codecs.iterdecode(csv_lines, 'utf-8'), delimiter=',')
    # Columns of CSV:
    # [0: date, 1: time, 2: abbreviation_canton_and_fl, 3: ncumul_tested,
    # 4: ncumul_conf, 5: new_hosp, 6: current_hosp, 7: current_icu,
    # 8: current_vent, 9: ncumul_released, 10: ncumul_deceased, 11: source,
    # 12: current_isolated, 13: current_quarantined,
    # 14: current_quarantined_riskareatravel, 15: current_quarantined_total]

    # Rows N-2, N-1
    last_rows = (None, None)
    for i, row in enumerate(reader):
        last_rows = (last_rows[-1], row)
        # Skip the first few rows to avoid going None in last_rows,
        # and to skip the header row.
        if i < 3:
            continue
        # Make sure all the rows we need are present, as they are not all
        # updated at the same time.
        if any(row[col] == '' for col in [0, 4, 11]):
            continue
        try:
            num_new_cases = int(last_rows[-1][4]) - int(last_rows[-2][4])
            num_isolated = last_rows[-1][12]
            num_isolated = num_isolated if num_isolated != '' else '(idk)'
            num_quarantined = last_rows[-1][13]
            num_quarantined = num_quarantined if num_quarantined != '' else '(idk)'
            date = last_rows[-1][0]
            source = last_rows[-1][11]
        except Exception as e:
            # We don't really care if it failed because a row was missing or w/e.
            pass

    return (f'On {date}, there were {num_new_cases} new cases reported in ' +
            f'Canton {canton}, with {num_isolated} in isolation and ' +
            f'{num_quarantined} in quarantine. Source: {source}')
Example #43
def _read_wiki_data(corpus_filename):
    """Reads the data from the compressed Wikipedia file into memory

    In an effort to cut down on runtime, only the first 1200000 bytes are read into memory. This is a high estimate
    of the amount of data we want. A later step refines this number
    
    This function also counts the number of occurrences of each character. Any character which appears less than 0.05%
    of the time is removed from the data

    :return: The raw data from disk
    """
    import tarfile
    with tarfile.open(corpus_filename, 'r:xz') as tar_file:
        raw_data = ''
        for member in tar_file.getmembers():
            _log.info('Reading from file %s' % member.name)
            member_stream = tar_file.extractfile(member)

            count = 0
            binary_chunks = iter(functools.partial(member_stream.read, 1), "")
            for unicode_chunk in codecs.iterdecode(binary_chunks, 'utf-8'):
                raw_data += unicode_chunk
                count += 1
                if count % 10000 == 0:
                    _log.info('Read in %s characters' % count)
                # 32K words * 10 characters per word = 320000 characters total
                # This is a super high estimate, but all well.
                if count >= 320000:
                    break

    #character_frequencies = defaultdict(int)
    #character_increment = 1.0 / len(raw_data)
    #for char in raw_data:
    #    character_frequencies[char.lower()] += character_increment
    #_log.info('Counted occurrences of each character')

    #data_filtered = [char.lower() for char in raw_data if character_frequencies[char.lower()] > 0.005]
    #_log.info('Filtered out uncommon characters')

    #return ''.join(data_filtered)

    return raw_data
Example #44
    def merge_namespaces():
        """Serves the page for merging bel namespaces"""
        form = MergeNamespaceForm()

        if not form.validate_on_submit():
            return render_template('merge_namespaces.html', form=form)

        log.warning(form.file)

        files = request.files.getlist("file")

        names = set()

        for file in files:
            log.warning('file: %s', file)
            resource = parse_bel_resource(codecs.iterdecode(file, 'utf-8'))
            names |= set(resource['Values'])

        si = StringIO()

        write_namespace(
            namespace_name=form.name.data,
            namespace_keyword=form.keyword.data,
            namespace_species=form.species.data,
            namespace_description=form.description.data,
            author_name=current_user.name,
            author_contact=current_user.email,
            citation_name=form.citation.data,
            citation_description=
            'This namespace was created by the PyBEL Web namespace merge service',
            namespace_domain=form.domain.data,
            author_copyright=form.licenses.data,
            values=names,
            cacheable=False,
            file=si)

        output = make_response(si.getvalue())
        output.headers[
            "Content-Disposition"] = "attachment; filename={}.belns".format(
                form.keyword.data)
        output.headers["Content-type"] = "text/plain"
        return output
Example #45
def readBase(csvFile):
    labels = []
    base = []
    listaPorEmocao = {'anger':"", 'boredom':"", 'empty':"", 'enthusiasm':"", 'fun':"", 'happiness':"",
                      'hate':"", 'love':"", 'neutral':"", 'relief':"", 'sadness':"", 'sentiment':"", 'surprise':"", 'worry':""}
    qtdEmocao = {'anger': 0, 'boredom': 0, 'empty': 0, 'enthusiasm': 0, 'fun': 0, 'happiness': 0,
                      'hate': 0, 'love': 0, 'neutral': 0, 'relief': 0, 'sadness': 0, 'sentiment': 0,
                      'surprise': 0, 'worry': 0}
    with open(csvFile) as csvfile:
        import codecs
        ifile = open(csvFile, "rb")
        read = csv.reader(codecs.iterdecode(ifile, 'utf-8'))

        for row in read:
            try:
                temp2 = str(row[0])
                labels.append(temp2)

                temp1 = str(row[1])

                temp1 = temp1.split()
                temp3 = []
                # apply stemming/lemmatization
                for u in range(len(temp1)):
                    #temp1[u] = stemmer.stem(temp1[u])
                    temp1[u] = lemmatizer.lemmatize(temp1[u])
                    if(temp1[u] in stopwords):
                        temp3.append(temp1[u])
                temp1 = " ".join(temp1)
                #remove stopwords
                if(len(temp3)>0):
                    for u in temp3:
                        temp1 = temp1.replace(u,"")

                # filtering
                temp1 = re.sub('[^A-Za-z]+', ' ', temp1)
                base.append(temp1)
                listaPorEmocao[temp2] = listaPorEmocao[temp2] + " " + temp1
                qtdEmocao[temp2] = qtdEmocao[temp2] + 1
            except IndexError:
                pass
    return base, labels, listaPorEmocao, qtdEmocao
Example #46
def read_doc_annotations(archive_file, force_redownload=False, pos_type='DOCUMENT_PNEUMONIA_YES'):
    print('Reading annotations from file : ' + archive_file)

    if 'http' in archive_file:
        filename = archive_file.split('/')[-1]
        if force_redownload or not os.path.isfile(filename):
            print('Downloading remote file : ' + archive_file)
            urllib.request.urlretrieve(archive_file, filename)
    else:
        filename = archive_file

    annotated_doc_map = {}

    print('Opening local file : ' + filename)
    z = zipfile.ZipFile(filename, "r")
    zinfo = z.namelist()
    for name in zinfo:
        if name.endswith('.txt') or name.endswith('.ann'):
            basename = name.split('.')[0].split('/')[-1]
            if basename not in annotated_doc_map:
                annotated_doc_map[basename] = AnnotatedDocument()
            anno_doc = annotated_doc_map[basename]
            # handle text and BRAT annotation files (.ann) differently
            if name.endswith('.txt'):
                with z.open(name) as f1:
                    anno_doc.text = f1.read().decode('utf8')
            else:
                with z.open(name) as f1:
                    # handle this as utf8 or we get back byte arrays
                    # print(name)
                    anno_doc.annotations = read_brat_annotations(codecs.iterdecode(f1, 'utf8'))

    # now let's finally assign a 0 or 1 to each document based on whether we see our expected type for the pneumonia label
    for key, anno_doc in annotated_doc_map.items():
        annos = anno_doc.annotations
        anno_doc.positive_label = 0
        for anno in annos:
            # NOTE : This "positive_label" relates to positive/possible cases of pneumonia
            if anno.type == pos_type:
                anno_doc.positive_label = 1

    return annotated_doc_map
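A hedged usage sketch for read_doc_annotations; the archive URL is a placeholder, and AnnotatedDocument / read_brat_annotations are assumed to be defined elsewhere in the original module.

# Placeholder URL for a zip of BRAT .txt/.ann files.
docs = read_doc_annotations('https://example.org/pneumonia_brat.zip')
positives = sum(d.positive_label for d in docs.values())
print('%d of %d documents labelled positive' % (positives, len(docs)))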
Example #47
0
def load_votes_from_stream(stream, filename):
    res = {}
    rd = []
    if filename.endswith(".csv"):
        if isinstance(stream, io.BytesIO):
            stream = codecs.iterdecode(stream, 'utf-8')
        for row in csv.reader(stream, skipinitialspace=True):
            rd.append(row)
    elif filename.endswith(".xlsx"):
        book = openpyxl.load_workbook(stream)
        sheet = book.active
        for row in sheet.rows:
            rd.append([cell.value for cell in row])
    else:
        return None, None, None

    res["constituencies"] = [row[0] for row in rd[1:]]
    for row in rd:
        del (row[0])

    if rd[0][0].lower() == "cons":
        res["constituency_seats"] = [
            int(row[0]) if row[0] else 0 for row in rd[1:]
        ]
        for row in rd:
            del (row[0])

    if rd[0][0].lower() == "adj":
        res["constituency_adjustment_seats"] = [
            int(row[0]) if row[0] else 0 for row in rd[1:]
        ]
        for row in rd:
            del (row[0])

    num_parties = 0
    while (num_parties < len(rd[0]) and rd[0][num_parties]):
        num_parties += 1
    res["parties"] = rd[0][:num_parties]
    res["votes"] = [[int(v) if v else 0 for v in row[:num_parties]]
                    for row in rd[1:]]

    return res
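The expected sheet layout is easiest to see with a small example. This is a sketch of the format inferred from the parsing code above (constituency name, optional "cons" and "adj" seat columns, then one vote column per party), not taken from any documentation.

import io

csv_text = (
    "name,cons,adj,A,B\n"
    "North,10,2,1500,900\n"
    "South,8,1,1200,1100\n"
)
votes = load_votes_from_stream(io.StringIO(csv_text), "votes.csv")
print(votes["constituencies"])   # ['North', 'South']
print(votes["parties"])          # ['A', 'B']
print(votes["votes"])            # [[1500, 900], [1200, 1100]]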
Example #48
0
def getrates(archivo):
    """Procedure to update change rates"""
    rates = {}
    url = "http://www.bankofcanada.ca/en/markets/csv/exchange_eng.csv"
    fh = urllib.request.urlopen(url)
    data = csv.reader(codecs.iterdecode(fh, "utf-8"))
    for row in data:
        if row[0].startswith("Date "):
            date = row[-1]
        elif not row[0].startswith("#"):
            value = float(row[-1])
            rates[row[1][1:].replace("_NOON", "").lower()] = value

    del rates["iexe0124"]
    del rates["iexe0125"]
    rates["cad"] = 1.
    for rate in rates:
        rates[rate] = rates[rate] / rates["usd"]
    rates["date"] = date
    json.dump(rates, open(archivo, "w"), indent=4)
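A short usage sketch for getrates; the output path is hypothetical, and the Bank of Canada noon-rate CSV referenced above may no longer be served at that URL.

import json

getrates("rates.json")           # hypothetical output file
with open("rates.json") as fh:
    rates = json.load(fh)
print(rates["date"], rates["cad"])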
Example #49
0
def download_products(db, catalog, token, CURRENT_VERSION):

    session = requests.Session()

    with closing(
            session.get(catalog["CatalogCSVUrl"],
                        stream=True,
                        cookies={'grs': token})) as r:
        reader = csv.reader(codecs.iterdecode(r.iter_lines(),
                                              encoding='utf-8'))
        line_count = 0
        for row in reader:
            if line_count == 0:
                # skip the header row
                line_count += 1
            elif len(row) > 1:
                add_product(db, catalog, row, CURRENT_VERSION)
                line_count += 1
            else:
                # surface empty or malformed rows
                print(row)
Example #50
0
def requests_get(url, result_type='text'):
    """
    :param url: url to GET
    :param result_type: text (default), json, or csv
    """
    logger.debug(f'GET {url}')

    response = requests.get(url, allow_redirects=True)
    if response.status_code != 200:
        message = f"GET {url}: HTTP {response.status_code}: {response.text}"
        logger.error(message)
        raise requests.exceptions.HTTPError(message)
    if result_type == 'json':
        return response.json()
    if result_type == 'csv':
        reader = csv.DictReader(
            codecs.iterdecode(response.iter_lines(), 'utf-8'))
        # returns list of dicts
        return list(reader)
    return response.text
Example #51
0
def test_refresh():
    """
    Test token expiration and refresh.
    """
    test_client = make_test_app().test_client()

    with patch('time.time', Mock(return_value=time.time())) as time_1:
        # authenticate and get an ID token cookie
        auth_redirect = test_client.get('/')
        callback_redirect = test_client.get(callback_url_for(auth_redirect))
        actual_page = test_client.get(callback_redirect.headers['Location'])
        page_text = ''.join(codecs.iterdecode(actual_page.response, 'utf-8'))
        assert page_text == 'too many secrets', "Authentication failed"

    # app should now try to use the refresh token
    with patch('time.time', Mock(return_value=time.time() + 10)) as time_2:
        test_client.get('/')
        body = parse_qs(last_request['body'])
        assert body.get('refresh_token') == ['mock_refresh_token'], \
            "App should have tried to refresh credentials"
Example #52
0
def insert_into_table(url):
    # first pass: read a sample of the payload so csv.Sniffer can detect the delimiter
    data = urllib.request.urlopen(url)
    response = data.read()
    encoding = data.headers.get_content_charset('utf-8')
    data_string = response.decode(encoding)
    dialect = csv.Sniffer().sniff(data_string[0:101])
    # second pass: re-open the URL so csv.reader sees the stream from the start
    data = urllib.request.urlopen(url)
    reader = csv.reader(codecs.iterdecode(data, 'utf-8'),
                        delimiter=dialect.delimiter)
    iterator = 0
    year = year_extraction(url)
    sqlQuery = "INSERT INTO masini (judet, categorie_nationala, categorie_comunitara, marca, descriere_comerciala, total, an) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    for record in reader:
        if iterator > 0:
            mycursor.execute(
                sqlQuery,
                (record[0], record[1], record[2], text_extract(
                    record[3]), record[4], record[5], year))
        iterator = iterator + 1
    mydb.commit()
Example #53
0
def verify_encode(file_obj, encoding, blocks=1, chunk_size=4096):
    """
    Iterate through the file chunking the data into blocks and decoding them.

    Here we can adjust the size of the blocks and how many of them to validate.
    By default, we just check the first 4K block.
    """

    good = True
    file_obj.seek(0)
    binary_chunks = iter(functools.partial(file_obj.read, chunk_size), b"")
    try:
        for unicode_chunk in codecs.iterdecode(binary_chunks, encoding):  # noqa
            if blocks:
                blocks -= 1
            else:
                break
    except Exception:
        good = False
    return good
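A minimal sketch of calling verify_encode on a file opened in binary mode; the file name and the candidate encodings are illustrative only.

# Check only the first 4K block (the defaults) against two candidate encodings.
with open("data.csv", "rb") as fobj:      # hypothetical file
    print("utf-8 ok:", verify_encode(fobj, "utf-8"))
    print("utf-16 ok:", verify_encode(fobj, "utf-16"))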
Example #54
0
    def _get_job_version(self):
        for record in self:
            if record.archive:
                with record._get_zipfile() as zf:
                    filename = 'jobInfo.properties'
                    # INFO: can't use configparser because this file
                    # has no section
                    with zf.open(filename) as f:
                        reader = csv.reader(codecs.iterdecode(
                            f.readlines(), 'utf-8'),
                            delimiter='=',
                            escapechar='\\',
                            quoting=csv.QUOTE_NONE)
                        for row in reader:
                            if row[0] == 'jobVersion':
                                record.version = row[1]
            else:
                record.version = max(record._get_all_children().filtered(
                    'version').mapped('version'), default='')
Example #55
0
def import_csv():
    """
    Imports a CSV and returns a dictionary of key: value pairs.

    Inputs
    ------
    filename (str): name of the CSV file. This should be pipe (`|`) delimited
        and include only the Primary Key (in string format) and the JSON data
        you want to add in the new column
    """
    id_to_json = {}

    url = "https://cockroach-university-public.s3.amazonaws.com/10000row_json_column.csv"
    ftpstream = urllib.request.urlopen(url)
    csvfile = csv.reader(codecs.iterdecode(ftpstream, 'utf-8'), delimiter='|')

    for row in csvfile:
        id_to_json[row[0]] = row[1]
        
    return id_to_json
Example #56
0
def parse_csv_file(fp_or_filename,
                   has_title=False,
                   has_header=False,
                   encoding='utf-8'):
    fp = fp_or_filename
    opened_here = False
    if isinstance(fp, str):
        fp = open(fp, newline='', encoding=encoding)
        opened_here = True
    else:
        # assume an open binary stream and decode it lazily
        fp = codecs.iterdecode(fp, encoding)
    reader = csv.reader(fp, delimiter=',')
    data = []
    try:
        # consume (and discard) the optional title and header rows
        if has_title:
            next(reader)
        if has_header:
            next(reader)
        for row in reader:
            data.append(row)
    except csv.Error as e:
        _logger.error('CSV Loading error!')
        return str(e)
    finally:
        if opened_here:
            fp.close()
    return data
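A usage sketch for parse_csv_file covering both call styles (a path, or an already-open binary stream, which is what the codecs.iterdecode branch handles); the file name is a placeholder.

# From a path: the file is opened (and closed) internally.
rows = parse_csv_file("people.csv", has_header=True)

# From a binary stream: bytes are decoded lazily.
with open("people.csv", "rb") as fp:
    rows = parse_csv_file(fp, has_header=True)

if isinstance(rows, str):
    print("CSV error:", rows)    # on csv.Error the error message is returned as a string
else:
    print(len(rows), "data rows")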
Example #57
0
    def test_run(self, fetcher, response, file_num, *args, **kwargs):
        # validate that the decorator is working as intended. should not
        # provide a response object, since it's the objective of the test
        assert response is None
        # validate settings
        assert fetcher.settings.start_date == utils.get_expected_start_date()

        expected_reader = utils.get_expected_data_files_as_csv(
            finra.source, file_num)

        for response in fetcher.run(show_progress=False):
            assert response is not None
            assert response.status_code == 200

            reader = csv.reader(
                codecs.iterdecode(response.iter_lines(),
                                  'utf-8',
                                  errors="replace"))
            for row in reader:
                assert row == next(expected_reader)
Example #58
0
def main():
    url = "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"
    stream = urlopen(url)
    csv_file = csv.reader(codecs.iterdecode(stream, 'utf-8'))
    iris_lst = [ExtendedList(row).lst for row in csv_file]

    for item in iris_lst:
        iris_value_list = []
        converted_iris_lst = ExtendedList.next_val(item)
        for element in converted_iris_lst:
            iris_value_list.append(element)
        if len(iris_value_list) == 0:
            continue
        else:
            iris_value_list = tuple(iris_value_list)

        insert_iris_list(iris_value_list)

    print('Done!')
Example #59
0
def read_whois(npc_name='list'):
	response = {}
	response['text'] = ''
	npcs = {}
	url = config['WHOIS_CSV']
	with closing(requests.get(url, stream=True)) as r:
		reader = csv.reader(codecs.iterdecode(r.iter_lines(), 'utf-8'), delimiter=',', quotechar='"')
		for row in reader:
			name = row[0]
			description = row[1]
			npcs[name] = description
		if npc_name.lower() == 'list':
			response['title'] = 'List of NPCs'
			for name in npcs.keys():
				response['text'] = response['text'] + '\n' + name
			return response
		search_match = process.extractOne(npc_name, list(npcs.keys()))
		response['title'] = search_match[0]
		response['text'] = npcs[search_match[0]]
		return response
Example #60
0
def read_text_resource(finput, encoding='utf-8', ignore_prefix='#'):
    """Read a text resource ignoring comments beginning with pound sign
    :param finput: path or file handle
    :type finput: str, file
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    :param ignore_prefix: text from this prefix to the end of the line is stripped
    :type ignore_prefix: str
    :rtype: generator
    """
    ctx = joint_context(codecs.iterdecode(finput, encoding=encoding)) \
        if isiterable(finput) \
        else codecs.open(finput, 'r', encoding=encoding)
    with ctx as fhandle:
        for line in fhandle:
            if ignore_prefix is not None:
                line = line.split(ignore_prefix)[0]
            line = line.strip()
            if line:
                yield line
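A sketch of both ways read_text_resource can be called, assuming joint_context wraps an iterable in a context manager and isiterable treats plain strings as paths rather than iterables; the resource name is hypothetical.

# From a path: the file is opened (and closed) for you.
for line in read_text_resource("stopwords.txt"):
    print(line)

# From an already-open binary handle: bytes are decoded lazily via codecs.iterdecode.
with open("stopwords.txt", "rb") as fh:
    lines = list(read_text_resource(fh))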