def ingest_from_directory(self, path):
    """
    Ingests NSRL CSV files from the passed directory path which have been
    extracted from an NSRL ISO image. This includes reading from the
    compressed CSV inside the NSRLFile.txt.zip file.

    :param str path: path to directory containing files to ingest
    """
    filenames = os.listdir(path)
    fmap = self._get_dir_ingest_filenames(filenames)

    # Ingest mfg, os and prod (in that order).
    for label, key, meth in [("mfg", "NSRLMfg.txt", "put_manufacturers"),
                             ("os", "NSRLOs.txt", "put_oss"),
                             ("prod", "NSRLProd.txt", "put_products")]:
        self.print("Inserting %s info..." % label, end=" ")
        s = time.time()
        with open(os.path.join(path, fmap[key]), "rb") as fh:
            reader = csv.reader(binfile_utf8_readlines(fh))
            res = getattr(self.client, meth)(reader)
        e = time.time()
        self.print("done! Put %d in %fs" % (len(res), e - s))

    # Finally, ingest file information.
    self.print("Inserting file info...")
    s = time.time()
    with zipfile.ZipFile(os.path.join(path, fmap["NSRLFile.txt.zip"])) as zf:
        reader = csv.reader(
            _zipped_file_readlines(zf, "NSRLFile.txt", skip_first=True))
        count = self.client.put_files(reader, verbose=self._verbose)
    e = time.time()
    self.print("File ingest done! Put %d in %fs" % (count, e - s))
def process_item(self, item, spider):
    isin = spider.isin
    reader = csv.reader(io.StringIO(item['csv']), delimiter=';')
    result = []
    next(reader)  # Skip header
    for line in reader:
        if not line:
            continue
        day = line[0]
        value = line[4]
        result.append((day, value))

    min_time = result[0][0]
    max_time = result[-1][0]
    output_filename = self.save_file.format(isin, min_time, max_time)
    with io.open(output_filename, 'w', newline='', encoding='UTF-8') as output:
        writer = csv.writer(output, delimiter=';')
        writer.writerow(['Datum', 'Kurs'])
        for t in result:
            writer.writerow([t[0], t[1]])
def parse(data, path, project, id=None):
    desc = basic_description(data, 'stratification', path)
    desc['idtype'] = data.get('idtype', data.get('rowtype', 'unknown'))
    for k, v in data.items():
        if k not in desc:
            desc[k] = v
    if id is not None:
        desc['id'] = id
    if 'size0' in data and 'ngroups' in data:
        desc['size'] = [int(data['size0'])]
        del desc['size0']
        desc['ngroups'] = int(data['ngroups'])
    else:  # derive from the data
        clusters = set()
        count = 0
        with io.open(path, 'r', newline='',
                     encoding=desc.get('encoding', 'utf-8')) as csvfile:
            reader = csv.reader(csvfile,
                                delimiter=desc.get('separator', u','),
                                quotechar=str(desc.get('quotechar', u'|')))
            for row in reader:
                count += 1
                clusters.add(row[1])
        desc['size'] = [count]
        desc['ngroups'] = len(clusters)
    return CSVStratification(desc, project)
def check_duplicated_columns(location):
    """
    Return a list of errors for duplicated column names in a CSV file
    at location.
    """
    location = add_unc(location)
    # FIXME: why ignore errors?
    with codecs.open(location, 'rb', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        columns = next(reader)
        columns = [col for col in columns]

    seen = set()
    dupes = OrderedDict()
    for col in columns:
        c = col.lower()
        if c in seen:
            if c in dupes:
                dupes[c].append(col)
            else:
                dupes[c] = [col]
        seen.add(c.lower())

    errors = []
    if dupes:
        dup_msg = []
        for name, names in dupes.items():
            names = u', '.join(names)
            msg = '%(name)s with %(names)s' % locals()
            dup_msg.append(msg)
        dup_msg = u', '.join(dup_msg)
        msg = ('Duplicated column name(s): %(dup_msg)s\n' % locals() +
               'Please correct the input and re-run.')
        errors.append(Error(ERROR, msg))
    return errors
def _read_data(self, pokemon):
    try:
        pokemon = pokemon.replace("(", "").replace(")", "").replace(" ", "_").replace("'", "").lower().title()
        with open('/data/%s_%s.csv' % (pokemon.lower(), self.cup.lower()),
                  encoding='ASCII') as f:
            logging.info("Running simulation for %s" % pokemon)
            result = [row for row in csv.reader(f, delimiter=',')]
            result.pop(0)
            sorted_result = sorted(result, key=lambda x: x[1])[::-1]
            challenger = pokemon.lower().title()
            self.node_data.append({"challenger": challenger,
                                   'challenger_id': stats[challenger]})
            hasher = hashlib.md5()
            hasher.update(challenger.encode('utf-8'))
            color = "#" + hasher.hexdigest()[:6]
            logging.info('Adding %s to list of pokemons' % challenger)
            for entry in sorted_result:
                opponent = (entry[0].lower().title().replace("(", "").replace(")", "")
                            .replace(" ", "_").replace("'", "").lower().title())
                if opponent not in stats:
                    logging.info("%s was not found in stats table" % opponent)
                    continue
                if int(entry[2]) > 0:
                    self.edge_data.append({
                        'challenger': challenger,
                        'challenger_id': stats[challenger],
                        'opponent': opponent,
                        'opponent_id': stats[opponent],
                        'weight': int(entry[2]) / 100,
                        'color': color})
            mc.set(self.url,
                   json.dumps({'edge': self.edge_data, 'node': self.node_data}),
                   86400)
        return True
    except Exception as e:
        traceback.print_exc()
        logging.info(e)
def readerAssertEqual(self, input, expected_result):
    with TemporaryFile("w+", newline='') as fileobj:
        fileobj.write(input)
        fileobj.seek(0)
        reader = csv.reader(fileobj, dialect=self.dialect)
        fields = list(reader)
        self.assertEqual(fields, expected_result)
def import_schedule(self):
    """
    Import in a Dict the schedule tweets information, using as key
    the tweet ID
    """
    count = 0
    DictSched = {}
    with io.open(self.schedule_file, newline='', encoding='utf-8') as inf:
        _reader = csv.reader(inf)
        next(_reader)
        # Iterate over Schedule Tweets
        for row in _reader:
            sch_tweet_id, created_at, text = row[:3]
            # Default to an empty list so `entities` is always defined,
            # even for rows without extra columns.
            entities = row[3:] if len(row) > 3 else []
            if self.limit and count == self.limit:
                break
            if count == 0:
                logging.info("Processing Schedule Tweets...")
            elif count % 1000 == 0:
                logging.info("Processed %d Schedule Tweets", count)
            count += 1
            DictSched[sch_tweet_id] = {}
            DictSched[sch_tweet_id]['text'] = text
            DictSched[sch_tweet_id]['entities'] = (
                self.extract_schedule_entities(text, entities))
            DictSched[sch_tweet_id]['created_at'] = parse(created_at)
    logging.info("Done!")
    return DictSched
def writeResults(clustered_dupes, input_file, output_file):
    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.
    logging.info('saving results to: %s' % output_file)

    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(clustered_dupes):
        for record_id in cluster:
            cluster_membership[record_id] = cluster_id

    unique_record_id = cluster_id + 1

    writer = csv.writer(output_file)
    reader = csv.reader(StringIO(input_file))

    heading_row = next(reader)
    heading_row.insert(0, u'Cluster ID')
    writer.writerow(heading_row)

    for row_id, row in enumerate(reader):
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]
        else:
            cluster_id = unique_record_id
            unique_record_id += 1
        row.insert(0, cluster_id)
        writer.writerow(row)
def from_csv(table):
    filepath = os.path.join(dirpath, table.name + ".csv")
    with io.open(filepath, mode="r", encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)
        conn = self.db.connect()
        with self.lock, conn.begin():
            for row in csv_reader:
                values = dict(zip(header, row))
                if self.engine_dialect_is(self.DIALECT_SQLITE):
                    identifier = values[table.c.id.name]
                    # try to update entry
                    stmt = update(table).values(values).where(
                        table.c.id == identifier)
                    if conn.execute(stmt).rowcount == 0:
                        # identifier doesn't match any => insert new entry
                        stmt = insert(table).values(values)
                        conn.execute(stmt)
                elif self.engine_dialect_is(self.DIALECT_POSTGRESQL):
                    stmt = pg_insert(table).values(values)\
                        .on_conflict_do_update(index_elements=[table.c.id],
                                               set_=values)
                    conn.execute(stmt)
            if self.engine_dialect_is(self.DIALECT_POSTGRESQL):
                # update sequence
                sql = "SELECT setval('{table}_id_seq', max(id)) FROM {table}".format(
                    table=table.name)
                conn.execute(text(sql))
        conn.close()
def handle(self, *args, **options):
    print(os.path.basename(options['filename']))
    os.chdir(os.path.dirname(options['filename']))
    filename = os.path.basename(options['filename'])
    reader = csv.reader(io.open(filename, encoding="utf-8"), delimiter=',')
    next(reader)  # skip header row
    # Map committee titles (honorary chairman, chairman, division chair,
    # executive director, auditor) to rank codes; anything else gets '05'.
    for i, row in enumerate(reader):
        if not row[11]:
            continue
        if row[1] == '名誉会長':
            rank = '00'
        elif row[1] == '会長':
            rank = '01'
        elif row[1] == '部会長':
            rank = '02'
        elif row[1] == '常任理事':
            rank = '03'
        elif row[1] == '監事':
            rank = '04'
        else:
            rank = '05'
        member = Member.objects.get(name=row[2], popular_name=row[3])
        member.rank = rank
        member.save()
def test_unicode_read(self):
    import io
    with TemporaryFile("w+", newline='', encoding="utf-8") as fileobj:
        fileobj.write(",".join(self.names) + "\r\n")
        fileobj.seek(0)
        reader = csv.reader(fileobj)
        self.assertEqual(list(reader), [self.names])
def from_csv(table):
    filepath = os.path.join(dirpath, table.name + ".csv")
    with io.open(filepath, mode="r", encoding="utf-8") as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)
        with self.lock, self.conn.begin():
            for row in csv_reader:
                values = dict(zip(header, row))
                if self.engine.dialect.name == self.DIALECT_SQLITE:
                    identifier = values[table.c.id]
                    # try to update entry
                    stmt = update(table).values(values).where(
                        table.c.id == identifier)
                    if self.conn.execute(stmt).rowcount == 0:
                        # identifier doesn't match any => insert new entry
                        stmt = insert(table).values(values)
                        self.conn.execute(stmt)
                elif self.engine.dialect.name == self.DIALECT_POSTGRESQL:
                    stmt = pg_insert(table).values(values)\
                        .on_conflict_do_update(index_elements=[table.c.id],
                                               set_=values)
                    self.conn.execute(stmt)
        if self.DIALECT_POSTGRESQL == self.engine.dialect.name:
            # update sequences
            self.conn.execute(
                text("SELECT setval('profiles_id_seq', max(id)) FROM profiles"))
            self.conn.execute(
                text("SELECT setval('spools_id_seq', max(id)) FROM spools"))
def label(module, infile, outfile, xml, type=None):
    training_data = data_prep_utils.TrainingData(xml, module)

    reader = csv.reader(infile)
    strings = set(row[0] for row in reader)
    labels = module.LABELS

    if type is None:
        tagger = module.TAGGER
    else:
        tagger = module.TAGGERS[type] or module.TAGGER

    if tagger:
        labeled_list, raw_strings_left = consoleLabel(strings, labels, module, type)
    else:
        labeled_list, raw_strings_left = naiveConsoleLabel(strings, labels, module)

    training_data.extend(labeled_list)

    with open(outfile, 'wb'):
        training_data.write(outfile)

    file_slug = os.path.basename(infile.name)
    if not file_slug.startswith('unlabeled_'):
        file_slug = 'unlabeled_' + file_slug

    remainder_file = os.path.dirname(infile.name) + file_slug
    data_prep_utils.list2file(raw_strings_left, remainder_file)
def load(self):
    if self._loaded is not None:
        return self._loaded
    data = []
    with io.open(self._path, 'r', newline='',
                 encoding=self._desc.get('encoding', 'utf-8')) as csvfile:
        reader = csv.reader(csvfile,
                            delimiter=self._desc.get('separator', u','),
                            quotechar=str(self._desc.get('quotechar', u'|')))
        data.extend(reader)

    # print data

    def to_num(s):
        try:
            return float(s)  # for int, long and float
        except ValueError:
            return s

    header = data[0]
    data = [[to_num(v) if i > 0 else v for i, v in enumerate(row)]
            for row in data[1:]]
    data.insert(0, header)

    # convert to col, row and data
    self._loaded = self._process(data)
    return self._loaded
def open(self):
    # Cheat the parent open
    self.encoding = 'UTF-8'
    f = Source.open(self)
    csvreader = csv.reader(f, delimiter=u';')
    structureet = [
        u'nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,lieuditbp,commune,departement,libdepartement,ligneacheminement,telephone,telecopie,categetab,libcategetab,categagretab,libcategagretab,siret,codeape,codemft,libmft,codesph,libsph,dateouv,dateautor,datemaj,numuai,coordxet,coordyet,sourcecoordet,datemajcoord'
        .split(',')
    ]
    geolocalisation = {}
    for row in csvreader:
        if row[0] == 'structureet':
            structureet.append(row[1:])
        elif row[0] == 'geolocalisation':
            geolocalisation[row[1]] = row[2:]
    for row in structureet:
        row += geolocalisation.get(row[0], [])
    csvfile = io.StringIO()
    writer = csv.writer(csvfile)
    for row in structureet:
        writer.writerow(row)
    csvfile.seek(0)
    return csvfile
def import_gazetters(self):
    """
    Create a set for each gazetteer defined in the config file
    """
    logging.info('Importing Gazeetters...')
    firstnames = set()
    lastnames = set()
    instrtypes = set()
    voicetypes = set()
    worktypes = set()
    notes = set()
    modes = set()
    opus = set()
    numbers = set()
    gazzetters = [(self.cfg_feat['FIRST_NAMES_GAZ'], firstnames),
                  (self.cfg_feat['LAST_NAMES_GAZ'], lastnames),
                  (self.cfg_feat['CONTR_TYPES_GAZ'], voicetypes),
                  (self.cfg_feat['INTRUMENT_TYPES_GAZ'], instrtypes),
                  (self.cfg_feat['WORK_TYPES_GAZ'], worktypes),
                  (self.cfg_feat['NOTES_GAZ'], notes),
                  (self.cfg_feat['MODES_GAZ'], modes),
                  (self.cfg_feat['OPUS_GAZ'], opus),
                  (self.cfg_feat['NUMBER_GAZ'], numbers)]
    for g_file, g_set in gazzetters:
        with io.open(g_file, newline='', encoding='utf-8') as csvfile:
            _reader = csv.reader(csvfile)
            for row in _reader:
                g_set.add(row[0].lower())
    logging.info('Done!')
    return gazzetters
def load_from_sheets():
    drive_folder = opj(ROOT, 'google_drive')
    data = {}
    header = {}
    keys = ['images', 'events', 'sections', 'press', 'resume', 'poems']
    for key in keys:
        data[key] = None
        header[key] = CONFIG.get('TSV', key)
    for filename in os.listdir(drive_folder):
        ext = os.path.splitext(filename)[1]
        if ext in ['.csv', '.xlsx', '.ods', '.pdf']:
            raise ValueError("You must download files as Tab-separated values")
        elif ext == '.tsv':
            with io.open(opj(drive_folder, filename), "r", encoding="utf-8") as F:
                contents = list(csv.reader(F, delimiter=u'\t'))
            #contents = [[col.replace(u'&',u'&') for col in row] for row in contents]
            for key in keys:
                if u';'.join(contents[0]) == header[key]:
                    if data[key] is not None:
                        raise ValueError("Multiple files for %s" % key)
                    data[key] = contents[1:]
                    break
            else:
                raise ValueError(
                    "Unrecognized file %s (did the header row change?)" % filename)
    missing = [key for (key, val) in data.items() if val is None]
    if missing:
        raise ValueError("Missing data for %s" % (', '.join(missing)))
    return data
def header(self):
    self.f = self.source.open()
    if self.have_header:
        line = self.f.readline().strip().strip(self.separator)
        return next(csv.reader([line], delimiter=self.separator,
                               quotechar=self.quote))
def writeUniqueResults(clustered_dupes, input_file, output_file):
    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.
    logging.info('saving unique results to: %s' % output_file)

    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(clustered_dupes):
        for record_id in cluster:
            cluster_membership[record_id] = cluster_id

    unique_record_id = cluster_id + 1

    writer = csv.writer(output_file)
    reader = csv.reader(StringIO(input_file))

    heading_row = next(reader)
    heading_row.insert(0, u'Cluster ID')
    writer.writerow(heading_row)

    seen_clusters = set()
    for row_id, row in enumerate(reader):
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]
            if cluster_id not in seen_clusters:
                row.insert(0, cluster_id)
                writer.writerow(row)
                seen_clusters.add(cluster_id)
        else:
            cluster_id = unique_record_id
            unique_record_id += 1
            row.insert(0, cluster_id)
            writer.writerow(row)
def read_data(include, data):
    """
    read csv and return the table in list. Return None when the include
    path is invalid.
    """
    if include is None:
        with io.StringIO(data) as file:
            raw_table_list = list(csv.reader(file))
    else:
        try:
            with io.open(str(include)) as file:
                raw_table_list = list(csv.reader(file))
        except IOError:  # FileNotFoundError is not in Python2
            raw_table_list = None
            panflute.debug("pantable: file not found from the path", include)
    return raw_table_list
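# A minimal sketch (not part of the original source) showing the two call
# modes of read_data() above; it assumes the surrounding module's imports
# (csv, io, panflute) are available and that "missing.csv" does not exist.
_inline_rows = read_data(None, "a,b\n1,2\n")   # parsed inline -> [['a', 'b'], ['1', '2']]
_file_rows = read_data("missing.csv", "")      # invalid include path -> None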
def readerAssertEqual(self, input, expected_result):
    with TemporaryFile("w+", newline="") as fileobj:
        fileobj.write(input)
        fileobj.seek(0)
        reader = csv.reader(fileobj, dialect=self.dialect)
        fields = list(reader)
        self.assertEqual(fields, expected_result)
def test_read_dict_fieldnames_from_file(self):
    with TemporaryFile("w+") as fileobj:
        fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj,
                                fieldnames=next(csv.reader(fileobj)))
        self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
        self.assertEqual(next(reader), {"f1": "1", "f2": "2", "f3": "abc"})
def test_register_kwargs(self):
    name = 'fedcba'
    csv.register_dialect(name, delimiter=';')
    try:
        self.assertEqual(csv.get_dialect(name).delimiter, ';')
        self.assertEqual([['X', 'Y', 'Z']], list(csv.reader(['X;Y;Z'], name)))
    finally:
        csv.unregister_dialect(name)
def detect_header(inputfile, dialect, fieldnames):
    """Test if file has a header or not, also returns number of columns in first row"""
    try:
        reader = csv.reader(inputfile, dialect)
    except csv.Error:
        try:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'default')
        except csv.Error:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'excel')
    header = next(reader)
    columncount = max(len(header), 3)
    if valid_fieldnames(header):
        return header
    return fieldnames[:columncount]
def test_read_dict_fieldnames_from_file(self):
    with TemporaryFile("w+") as fileobj:
        fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj,
                                fieldnames=next(csv.reader(fileobj)))
        self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
        self.assertEqual(next(reader), {"f1": '1', "f2": '2', "f3": 'abc'})
def test_roundtrip_escaped_unquoted_newlines(self):
    with TemporaryFile("w+", newline="") as fileobj:
        writer = csv.writer(fileobj, quoting=csv.QUOTE_NONE, escapechar="\\")
        rows = [["a\nb", "b"], ["c", "x\r\nd"]]
        writer.writerows(rows)
        fileobj.seek(0)
        for i, row in enumerate(csv.reader(fileobj,
                                           quoting=csv.QUOTE_NONE,
                                           escapechar="\\")):
            self.assertEqual(row, rows[i])
def test_register_kwargs(self):
    name = "fedcba"
    csv.register_dialect(name, delimiter=";")
    try:
        self.assertEqual(csv.get_dialect(name).delimiter, ";")
        self.assertEqual([["X", "Y", "Z"]], list(csv.reader(["X;Y;Z"], name)))
    finally:
        csv.unregister_dialect(name)
def writeLinkedResults(clustered_pairs, input_1, input_2, output_file,
                       inner_join=False):
    logging.info('saving unique results to: %s' % output_file)

    matched_records = []
    seen_1 = set()
    seen_2 = set()

    input_1 = [row for row in csv.reader(StringIO(input_1))]
    row_header = input_1.pop(0)
    length_1 = len(row_header)

    input_2 = [row for row in csv.reader(StringIO(input_2))]
    row_header_2 = input_2.pop(0)
    length_2 = len(row_header_2)

    row_header += row_header_2

    for pair in clustered_pairs:
        index_1, index_2 = [int(index.split('|', 1)[1]) for index in pair[0]]

        matched_records.append(input_1[index_1] + input_2[index_2])
        seen_1.add(index_1)
        seen_2.add(index_2)

    writer = csv.writer(output_file,
                        delimiter=',',
                        quotechar='"',
                        quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(row_header)

    for matches in matched_records:
        writer.writerow(matches)

    if not inner_join:
        for i, row in enumerate(input_1):
            if i not in seen_1:
                writer.writerow(row + [None] * length_2)

        for i, row in enumerate(input_2):
            if i not in seen_2:
                writer.writerow([None] * length_1 + row)
def test_roundtrip_quoteed_newlines(self):
    with TemporaryFile("w+", newline="") as fileobj:
        writer = csv.writer(fileobj)
        self.assertRaises(TypeError, writer.writerows, None)
        rows = [["a\nb", "b"], ["c", "x\r\nd"]]
        writer.writerows(rows)
        fileobj.seek(0)
        for i, row in enumerate(csv.reader(fileobj)):
            self.assertEqual(row, rows[i])
def detect_header(sample, dialect, fieldnames):
    """Test if file has a header or not, also returns number of columns in first row"""
    # Wrap the sample text in a file-like object so csv.reader can iterate it.
    inputfile = io.StringIO(sample)
    try:
        reader = csv.reader(inputfile, dialect)
    except csv.Error:
        try:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'default')
        except csv.Error:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'excel')
    header = next(reader)
    columncount = max(len(header), 3)
    if valid_fieldnames(header):
        return header
    return fieldnames[:columncount]
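# A hypothetical driver for detect_header() above; it is not part of the
# original code. It assumes valid_fieldnames() is importable in this module
# and uses the standard-library csv.Sniffer to guess a dialect from the same
# sample text before falling back to placeholder field names.
def _detect_header_demo(sample):
    dialect = csv.Sniffer().sniff(sample)
    fallback = ['col1', 'col2', 'col3']
    return detect_header(sample, dialect, fallback)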
def test_roundtrip_quoteed_newlines(self):
    with TemporaryFile("w+", newline='') as fileobj:
        writer = csv.writer(fileobj)
        self.assertRaises(TypeError, writer.writerows, None)
        rows = [['a\nb', 'b'], ['c', 'x\r\nd']]
        writer.writerows(rows)
        fileobj.seek(0)
        for i, row in enumerate(csv.reader(fileobj)):
            self.assertEqual(row, rows[i])
def test_read_linenum(self):
    r = csv.reader(["line,1", "line,2", "line,3"])
    self.assertEqual(r.line_num, 0)
    next(r)
    self.assertEqual(r.line_num, 1)
    next(r)
    self.assertEqual(r.line_num, 2)
    next(r)
    self.assertEqual(r.line_num, 3)
    self.assertRaises(StopIteration, next, r)
    self.assertEqual(r.line_num, 3)
def iterate_rows(filepath, headers):
    with io.open(filepath, encoding='utf-8') as f:
        reader = csv.reader(f,
                            delimiter=self.separator,
                            escapechar=self.escapeCharacter,
                            lineterminator=self.lineEnd,
                            quotechar=self.quoteCharacter)
        if self.header:
            header = next(reader)
        for row in reader:
            yield cast_values(row, headers)
def test_read_linenum(self):
    r = csv.reader(['line,1', 'line,2', 'line,3'])
    self.assertEqual(r.line_num, 0)
    next(r)
    self.assertEqual(r.line_num, 1)
    next(r)
    self.assertEqual(r.line_num, 2)
    next(r)
    self.assertEqual(r.line_num, 3)
    self.assertRaises(StopIteration, next, r)
    self.assertEqual(r.line_num, 3)
def run(self):
    """
    Iterate over the input tweets and for each one extract several
    features. It writes the two files needed for the NER experiments
    """
    count = 0
    max_len = 0
    with io.open(self.input_file, newline='', encoding='utf-8') as inf:
        _reader = csv.reader(inf)
        next(_reader)
        # Iterate over User Generated Tweets
        for row in _reader:
            tweet_id, creation_date, text = row[0:3]
            # Break if limit is reached
            if self.limit and count > self.limit - 1:
                break
            if count == 0:
                logging.info("Processing tweets...")
            elif count % 250 == 0:
                logging.info("Processed %d tweets", count)
            count += 1
            # Extract POS and Chunk TAG
            self.tokens_tagged = self.tagger.tag_sentence(text)
            # Get max tweet length for normalization
            if len(self.tokens_tagged) > max_len:
                max_len = len(self.tokens_tagged)
            # Add Entities annotations to tokens
            self.get_entities_annotated(tweet_id, text)
            # Add Boolean features to tokens
            self.get_boolean_features()
            # Add tweet_id
            for i, token_tagged in enumerate(self.tokens_tagged):
                self.tokens_tagged[i] = token_tagged + (tweet_id, )
            self.out_tokens += self.tokens_tagged
    logging.info("Processed %d tweets", count)
    logging.info("Done!")
    # Add contextual features to tokens
    self.get_contextual_features()
    self.normalize_position(max_len)
    # Write output files
    self.write_weka()
    self.write_NN()
def __init__(self, config, logger=None):
    Analyser_Merge_Dynamic.__init__(self, config, logger)

    with open("merge_data/pitch_FR.mapping.csv") as mappingfile:
        spamreader = csv.reader(mappingfile)
        for row in spamreader:
            classs, topic = row[0:2]
            tags = list(map(lambda t: t.split('=') if t else None, row[2:5]))
            osmTags = dict(filter(lambda t: t, tags[0:2]))
            if len(osmTags) > 0:
                defaultTags = dict(filter(lambda t: t, tags[2:3]))
                self.classFactory(SubAnalyser_Merge_Pitch_FR, classs, classs,
                                  topic, osmTags, defaultTags)
def test_space_dialect(self):
    class space(csv.excel):
        delimiter = " "
        quoting = csv.QUOTE_NONE
        escapechar = "\\"

    with TemporaryFile("w+") as fileobj:
        fileobj.write("abc def\nc1ccccc1 benzene\n")
        fileobj.seek(0)
        reader = csv.reader(fileobj, dialect=space())
        self.assertEqual(next(reader), ["abc", "def"])
        self.assertEqual(next(reader), ["c1ccccc1", "benzene"])
def parse(data, path, project, id=None):
    desc = basic_description(data, 'matrix', path)
    desc['rowtype'] = data.get('rowtype', 'unknown')
    desc['coltype'] = data.get('coltype', 'unknown')
    desc['value'] = dict(type=data.get('value_type', 'real'))
    for k, v in data.items():
        if k not in desc:
            desc[k] = v
    if id is not None:
        desc['id'] = id
    if all((k in data) for k in ['size0', 'size1', 'value_min', 'value_max']):
        desc['size'] = [int(data['size0']), int(data['size1'])]
        del desc['size0']
        del desc['size1']
        desc['value']['range'] = [
            float(data['value_min']),
            float(data['value_max'])
        ]
        del desc['value_min']
        del desc['value_max']
    else:  # derive from the data
        rows = 0
        cols = None
        min_v = None
        max_v = None
        with io.open(path, 'r', newline='',
                     encoding=desc.get('encoding', 'utf-8')) as csvfile:
            reader = csv.reader(csvfile,
                                delimiter=desc.get('separator', u','),
                                quotechar=str(desc.get('quotechar', u'|')))
            for row in reader:
                if cols is None:
                    cols = len(row) - 1
                else:
                    rows += 1
                    min_act = min((float(f) for f in row[1:]))
                    min_v = min_act if min_v is None else min(min_act, min_v)
                    max_act = max((float(f) for f in row[1:]))
                    max_v = max_act if max_v is None else max(max_act, max_v)
        desc['size'] = [rows, cols]
        desc['value']['range'] = [
            float(data['value_min']) if 'value_min' in data else min_v,
            float(data['value_max']) if 'value_max' in data else max_v
        ]
    return CSVMatrix(desc, project)
def importdata(request, formclass=ImportDataForm):
    """Generic import function

    As the process of importing data from a CSV file is the same
    whatever the type, we do a maximum of the work here.

    :param request: a ``Request`` instance
    :param typ: a string indicating the object type being imported
    :return: a ``Response`` instance
    """
    error = None
    form = formclass(request.POST, request.FILES)
    if form.is_valid():
        try:
            infile = io.TextIOWrapper(request.FILES['sourcefile'].file,
                                      encoding="utf8")
            reader = csv.reader(infile, delimiter=form.cleaned_data['sepchar'])
        except csv.Error as inst:
            error = smart_text(inst)
        else:
            try:
                cpt = 0
                for row in reader:
                    if not row:
                        continue
                    fct = signals.import_object.send(sender="importdata",
                                                     objtype=row[0].strip())
                    fct = [func for x_, func in fct if func is not None]
                    if not fct:
                        continue
                    fct = fct[0]
                    with transaction.atomic():
                        try:
                            fct(request.user, row, form.cleaned_data)
                        except Conflict:
                            if form.cleaned_data["continue_if_exists"]:
                                continue
                            raise Conflict(
                                _("Object already exists: %s"
                                  % form.cleaned_data['sepchar'].join(row[:2])))
                    cpt += 1
                msg = _("%d objects imported successfully" % cpt)
                return render(request, "admin/import_done.html", {
                    "status": "ok", "msg": msg
                })
            except (ModoboaException) as e:
                error = str(e)

    return render(request, "admin/import_done.html", {
        "status": "ko", "msg": error
    })
def importdata(request, formclass=ImportDataForm):
    """Generic import function

    As the process of importing data from a CSV file is the same
    whatever the type, we do a maximum of the work here.

    :param request: a ``Request`` instance
    :param typ: a string indicating the object type being imported
    :return: a ``Response`` instance
    """
    error = None
    form = formclass(request.POST, request.FILES)
    if form.is_valid():
        try:
            infile = io.TextIOWrapper(
                request.FILES["sourcefile"].file, encoding="utf8")
            reader = csv.reader(infile, delimiter=form.cleaned_data["sepchar"])
        except csv.Error as inst:
            error = smart_text(inst)
        else:
            try:
                cpt = 0
                for row in reader:
                    if not row:
                        continue
                    fct = signals.import_object.send(
                        sender="importdata", objtype=row[0].strip())
                    fct = [func for x_, func in fct if func is not None]
                    if not fct:
                        continue
                    fct = fct[0]
                    with transaction.atomic():
                        try:
                            fct(request.user, row, form.cleaned_data)
                        except Conflict:
                            if form.cleaned_data["continue_if_exists"]:
                                continue
                            raise Conflict(
                                _("Object already exists: %s"
                                  % form.cleaned_data["sepchar"].join(row[:2]))
                            )
                    cpt += 1
                msg = _("%d objects imported successfully" % cpt)
                return render(request, "admin/import_done.html", {
                    "status": "ok", "msg": msg
                })
            except (ModoboaException) as e:
                error = str(e)

    return render(request, "admin/import_done.html", {
        "status": "ko", "msg": error
    })
def __init__(self, config, logger=None):
    Analyser_Merge_Dynamic.__init__(self, config, logger)

    with open("merge_data/geodesie_support_FR.mapping.csv") as mappingfile:
        spamreader = csv.reader(mappingfile, delimiter=u';')
        for row in spamreader:
            item, classs, level, topic = row[0:4]
            tags = list(map(lambda t: t.split('=') if t else None, row[4:7]))
            osmTags = dict(filter(lambda t: t, tags[0:2]))
            if len(osmTags) > 0:
                defaultTags = dict(filter(lambda t: t, tags[2:3]))
                slug = u''.join(filter(lambda x: x.isalpha(),
                                       topic.split('|')[0]))\
                    .capitalize().encode('ascii', 'ignore').decode('utf8')
                self.classFactory(SubAnalyser_Geodesie_Support_FR, slug, item,
                                  classs, level, topic, osmTags, defaultTags)
def writeLinkedResults(clustered_pairs, input_1, input_2, output_file,
                       inner_join=False):
    logging.info('saving unique results to: %s' % output_file)

    matched_records = []
    seen_1 = set()
    seen_2 = set()

    input_1 = [row for row in csv.reader(StringIO(input_1))]
    row_header = input_1.pop(0)
    length_1 = len(row_header)

    input_2 = [row for row in csv.reader(StringIO(input_2))]
    row_header_2 = input_2.pop(0)
    length_2 = len(row_header_2)

    row_header += row_header_2

    for pair in clustered_pairs:
        index_1, index_2 = [int(index.split('|', 1)[1]) for index in pair[0]]

        matched_records.append(input_1[index_1] + input_2[index_2])
        seen_1.add(index_1)
        seen_2.add(index_2)

    writer = csv.writer(output_file)
    writer.writerow(row_header)

    for matches in matched_records:
        writer.writerow(matches)

    if not inner_join:
        for i, row in enumerate(input_1):
            if i not in seen_1:
                writer.writerow(row + [None] * length_2)

        for i, row in enumerate(input_2):
            if i not in seen_2:
                writer.writerow([None] * length_1 + row)
def __init__(self, config, logger=None):
    Analyser_Merge_Dynamic.__init__(self, config, logger)

    with open("merge_data/shop_FR.mapping.csv") as mappingfile:
        spamreader = csv.reader(mappingfile)
        for row in spamreader:
            if row[0][0] == '#':
                continue
            items, classs, level, title = row[0:4]
            items = items.split('|')
            level = int(level)
            # Materialize the filter so it can be sized and reused below
            # (a bare filter object has no len() and is single-pass).
            osmTags = list(filter(lambda a: a,
                                  map(lambda t: (t.split('=') + [None])[0:2] if t else None,
                                      row[4:])))
            if len(osmTags) > 0:
                self.classFactory(SubAnalyser_Merge_Shop_FR, classs, items,
                                  classs, level, title, dict(osmTags),
                                  dict(filter(lambda a: a[1], osmTags)))
def from_table_query(cls, synapse, query, quoteCharacter='"',
                     escapeCharacter="\\", lineEnd=str(os.linesep),
                     separator=",", header=True,
                     includeRowIdAndRowVersion=True):
    """
    Create a Table object wrapping a CSV file resulting from querying a
    Synapse table. Mostly for internal use.
    """

    download_from_table_result, file_info = synapse._queryTableCsv(
        query=query,
        quoteCharacter=quoteCharacter,
        escapeCharacter=escapeCharacter,
        lineEnd=lineEnd,
        separator=separator,
        header=header,
        includeRowIdAndRowVersion=includeRowIdAndRowVersion)

    ## A dirty hack to find out if we got back row ID and Version
    ## in particular, we don't get these back from aggregate queries
    with io.open(file_info['path'], 'r', encoding='utf-8') as f:
        reader = csv.reader(f,
                            delimiter=separator,
                            escapechar=escapeCharacter,
                            lineterminator=lineEnd,
                            quotechar=quoteCharacter)
        first_line = next(reader)
        if len(download_from_table_result['headers']) + 2 == len(first_line):
            includeRowIdAndRowVersion = True
        else:
            includeRowIdAndRowVersion = False

    self = cls(
        filepath=file_info['path'],
        schema=download_from_table_result.get('tableId', None),
        etag=download_from_table_result.get('etag', None),
        quoteCharacter=quoteCharacter,
        escapeCharacter=escapeCharacter,
        lineEnd=lineEnd,
        separator=separator,
        header=header,
        includeRowIdAndRowVersion=includeRowIdAndRowVersion,
        headers=[SelectColumn(**header)
                 for header in download_from_table_result['headers']])

    return self
def _import(self, filename, options, encoding="utf-8"):
    """Import domains or identities."""
    superadmin = (
        core_models.User.objects.filter(is_superuser=True).first()
    )
    if not os.path.isfile(filename):
        raise CommandError("File not found")

    num_lines = sum(
        1 for line in io.open(filename, encoding=encoding) if line
    )
    pbar = progressbar.ProgressBar(
        widgets=[
            progressbar.Percentage(),
            progressbar.Bar(),
            progressbar.ETA()
        ],
        maxval=num_lines
    ).start()
    with io.open(filename, encoding=encoding, newline="") as f:
        reader = csv.reader(f, delimiter=options["sepchar"])
        i = 0
        for row in reader:
            if not row:
                continue
            fct = signals.import_object.send(
                sender=self.__class__, objtype=row[0].strip())
            fct = [func for x_, func in fct if func is not None]
            if not fct:
                continue
            fct = fct[0]
            try:
                fct(superadmin, row, options)
            except Conflict:
                if options["continue_if_exists"]:
                    continue
                raise CommandError(
                    "Object already exists: {}".format(
                        options["sepchar"].join(row[:2])))
            i += 1
            pbar.update(i)
    pbar.finish()
def open(self):
    # Cheat the parent open
    encoding, self.encoding = self.encoding, 'UTF-8'
    f = Source.open(self)
    csvreader = csv.reader(f, delimiter=u';')
    structureet = [
        u'nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,lieuditbp,commune,departement,libdepartement,ligneacheminement,telephone,telecopie,categetab,libcategetab,categagretab,libcategagretab,siret,codeape,codemft,libmft,codesph,libsph,dateouv,dateautor,datemaj,numuai,coordxet,coordyet,sourcecoordet,datemajcoord'
        .split(',')
    ]
    geolocalisation = {}
    for row in csvreader:
        if row[0] == 'structureet':
            structureet.append(row[1:])
        elif row[0] == 'geolocalisation':
            geolocalisation[row[1]] = row[2:]
    for row in structureet:
        row += geolocalisation.get(row[0], [])
    csvfile = io.StringIO()
    writer = csv.writer(csvfile)
    for row in structureet:
        writer.writerow(row)
    csvfile.seek(0)
    return csvfile
"STL", "BLK", "PF", ] ) sys.exit(0) pwd = sys.argv[1] os.chdir(pwd) parts = pwd.rstrip("\r\n").split("/") game, season, division, team, date = parts[-5:] date = date.replace(".", "/") with open("Info.csv") as f: data = list(csv.reader(f))[1] if len(data) == 2: time, officials = data location = "" attendance = "" elif len(data) == 3: time, location, officials = data attendance = "" else: time, location, attendance, officials = data attendance = attendance.replace(",", "") time_struct, _ = parsedatetime.Calendar().parse(time) time = "{:02d}:{:02d}".format(*(time_struct[3:5]))
import io
import sys

import backports.csv as csv
import html2text
from markdown import markdown

CSV_FILE_NAME = 'forums.csv'  # the file to import
NEW_CSV_FILE_NAME = 'forums_markdown.csv'  # the file to create

# some content fields are bigger than csv.field_size_limit
csv.field_size_limit(sys.maxsize)

with io.open(CSV_FILE_NAME, 'r') as csvfile, \
        io.open(NEW_CSV_FILE_NAME, 'w') as writecsvfile:
    reader = csv.reader(csvfile, delimiter=u',', quotechar=u'"')
    writer = csv.writer(writecsvfile)
    counter = 0
    for row in reader:
        col_number = 0
        my_row = []
        for col in row:
            if col_number == 3:
                # use the permalink as the file name
                title = col
            col_number = col_number + 1
            if col_number == 6:  # & counter != 0:
                # aha, a content field!
                h = html2text.HTML2Text()
                markdown_col = h.handle(col)
def read_csv(fname):
    with iopen(fname, newline='', encoding='utf-8-sig') as f:
        for row in reader(f):
            yield row
def _read_test(self, input, expect, **kwargs):
    reader = csv.reader(input, **kwargs)
    result = list(reader)
    self.assertEqual(result, expect)
def _read_csv(file_path, delimiter=DEFAULT_DELIMITER):
    return csv.reader(
        read_lines(file_path),
        delimiter=delimiter
    )