Example #1
    def ingest_from_directory(self, path):
        """
        Ingest NSRL CSV files from the given directory path, which have
        been extracted from an NSRL ISO image. This includes reading the
        compressed CSV inside the NSRLFile.txt.zip file.

        :param str path: path to directory containing files to ingest
        """
        filenames = os.listdir(path)
        fmap = self._get_dir_ingest_filenames(filenames)

        # Ingest mfg, os and prod (in that order).
        for label, key, meth in [("mfg", "NSRLMfg.txt", "put_manufacturers"),
                                 ("os", "NSRLOs.txt", "put_oss"),
                                 ("prod", "NSRLProd.txt", "put_products")]:
            self.print("Inserting %s info..." % label, end=" ")
            s = time.time()
            with open(os.path.join(path, fmap[key]), "rb") as fh:
                reader = csv.reader(binfile_utf8_readlines(fh))
                res = getattr(self.client, meth)(reader)
            e = time.time()
            self.print("done! Put %d in %fs" % (len(res), e - s))

        # Finally, ingest file information.
        self.print("Inserting file info...")
        s = time.time()
        with zipfile.ZipFile(os.path.join(path,
                                          fmap["NSRLFile.txt.zip"])) as zf:
            reader = csv.reader(
                _zipped_file_readlines(zf, "NSRLFile.txt", skip_first=True))
            count = self.client.put_files(reader, verbose=self._verbose)
        e = time.time()
        self.print("File ingest done! Put %d in %fs" % (count, e - s))
Example #2
    def process_item(self, item, spider):
        isin = spider.isin

        reader = csv.reader(io.StringIO(item['csv']), delimiter=';')
        result = []

        next(reader)  # Skip header

        for line in reader:
            if not line:
                continue

            day = line[0]
            value = line[4]
            result.append((day, value))

        min_time = result[0][0]
        max_time = result[-1][0]

        output_filename = self.save_file.format(isin, min_time, max_time)

        with io.open(output_filename, 'w', newline='',
                     encoding='UTF-8') as output:
            writer = csv.writer(output, delimiter=';')
            writer.writerow(['Datum', 'Kurs'])
            for t in result:
                writer.writerow([t[0], t[1]])
Example #3
    def parse(data, path, project, id=None):
        desc = basic_description(data, 'stratification', path)
        desc['idtype'] = data.get('idtype', data.get('rowtype', 'unknown'))

        for k, v in data.items():
            if k not in desc:
                desc[k] = v
        if id is not None:
            desc['id'] = id
        if 'size0' in data and 'ngroups' in data:
            desc['size'] = [int(data['size0'])]
            del desc['size0']
            desc['ngroups'] = int(data['ngroups'])
        else:  # derive from the data
            clusters = set()
            count = 0
            with io.open(path,
                         'r',
                         newline='',
                         encoding=desc.get('encoding', 'utf-8')) as csvfile:
                reader = csv.reader(csvfile,
                                    delimiter=desc.get('separator', u','),
                                    quotechar=str(desc.get('quotechar', u'|')))
                for row in reader:
                    count += 1
                    clusters.add(row[1])
            desc['size'] = [count]
            desc['ngroups'] = len(clusters)

        return CSVStratification(desc, project)
Example #4
def check_duplicated_columns(location):
    """
    Return a list of errors for duplicated column names in a CSV file
    at location.
    """
    location = add_unc(location)
    # FIXME: why ignore errors?
    with codecs.open(location, 'rb', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.reader(csvfile)
        columns = next(reader)
        columns = [col for col in columns]

    seen = set()
    dupes = OrderedDict()
    for col in columns:
        c = col.lower()
        if c in seen:
            if c in dupes:
                dupes[c].append(col)
            else:
                dupes[c] = [col]
        seen.add(c.lower())

    errors = []
    if dupes:
        dup_msg = []
        for name, names in dupes.items():
            names = u', '.join(names)
            msg = '%(name)s with %(names)s' % locals()
            dup_msg.append(msg)
        dup_msg = u', '.join(dup_msg)
        msg = ('Duplicated column name(s): %(dup_msg)s\n' % locals() +
               'Please correct the input and re-run.')
        errors.append(Error(ERROR, msg))
    return errors
Example #5
    def _read_data(self, pokemon):
        try:
            pokemon = pokemon.replace("(", "").replace(")", "").replace(" ", "_").replace("'", "").lower().title()
            with open('/data/%s_%s.csv' % (pokemon.lower(), self.cup.lower()), encoding='ASCII') as f:
                logging.info("Running simulation for %s" % pokemon)
                result = [row for row in csv.reader(f, delimiter=',')]
                result.pop(0)
                sorted_result = sorted(result, key=lambda x: x[1])[::-1]
                challenger = pokemon.lower().title()
                self.node_data.append({"challenger": challenger, 'challenger_id': stats[challenger]})
                hasher = hashlib.md5()
                hasher.update(challenger.encode('utf-8'))
                color = "#" + hasher.hexdigest()[:6]
                logging.info('Adding %s to list of pokemons' % challenger)
                for entry in sorted_result:
                    opponent = entry[0].lower().title().replace("(", "").replace(")", "").replace(" ", "_").replace("'", "").lower().title()
                    if opponent not in stats:
                        logging.info("%s was not found in stats table" % opponent)
                        continue
                    if int(entry[2]) > 0:
                        self.edge_data.append({
                            'challenger': challenger,
                            'challenger_id': stats[challenger],
                            'opponent': opponent,
                            'opponent_id': stats[opponent],
                            'weight': int(entry[2])/100,
                            'color': color})
            mc.set(self.url, json.dumps({'edge': self.edge_data, 'node': self.node_data}), 86400)
            return True
        except Exception as e:
            traceback.print_exc()
            logging.info(e)
Example #6
 def readerAssertEqual(self, input, expected_result):
     with TemporaryFile("w+", newline='') as fileobj:
         fileobj.write(input)
         fileobj.seek(0)
         reader = csv.reader(fileobj, dialect=self.dialect)
         fields = list(reader)
         self.assertEqual(fields, expected_result)
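A minimal sketch of how a helper like this is typically driven, assuming a unittest mixin that fixes the dialect per test class (the class and test names here are illustrative):

import csv
import unittest
from tempfile import TemporaryFile


class ReaderAssertions:
    dialect = csv.excel

    def readerAssertEqual(self, input, expected_result):
        with TemporaryFile("w+", newline='') as fileobj:
            fileobj.write(input)
            fileobj.seek(0)
            reader = csv.reader(fileobj, dialect=self.dialect)
            self.assertEqual(list(reader), expected_result)


class TestExcelReader(ReaderAssertions, unittest.TestCase):
    def test_quoted_field(self):
        self.readerAssertEqual('"a,b",c\r\n', [['a,b', 'c']])


if __name__ == '__main__':
    unittest.main()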
Example #7
    def import_schedule(self):
        """
        Import in a Dict the schedule tweets information, using as key
        the tweet ID
        """
        count = 0
        DictSched = {}
        with io.open(self.schedule_file, newline='', encoding='utf-8') as inf:
            _reader = csv.reader(inf)
            next(_reader)
            # Iterate over Schedule Tweets
            for row in _reader:
                sch_tweet_id, created_at, text = row[:3]
                # row[3:] is an empty list when there are no extra columns,
                # so 'entities' is always defined for extract_schedule_entities()
                entities = row[3:]

                if self.limit and count == self.limit:
                    break
                if count == 0:
                    logging.info("Processing Schedule Tweets...")
                elif count % 1000 == 0:
                    logging.info("Processed %d Schedule Tweets", count)
                count += 1

                DictSched[sch_tweet_id] = {}
                DictSched[sch_tweet_id]['text'] = text
                DictSched[sch_tweet_id]['entities'] = (
                    self.extract_schedule_entities(text, entities))
                DictSched[sch_tweet_id]['created_at'] = parse(created_at)

        logging.info("Done!")
        return DictSched
Example #8
def writeResults(clustered_dupes, input_file, output_file):

    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.

    logging.info('saving results to: %s' % output_file)

    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(clustered_dupes):
        for record_id in cluster:
            cluster_membership[record_id] = cluster_id

    unique_record_id = cluster_id + 1

    writer = csv.writer(output_file)

    reader = csv.reader(StringIO(input_file))

    heading_row = next(reader)
    heading_row.insert(0, u'Cluster ID')
    writer.writerow(heading_row)

    for row_id, row in enumerate(reader):
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]
        else:
            cluster_id = unique_record_id
            unique_record_id += 1
        row.insert(0, cluster_id)
        writer.writerow(row)
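A rough usage sketch of the function above, assuming dedupe-style clusters (each entry a tuple of record ids plus their confidence scores); the sample data and names below are purely illustrative:

import io

# The original data as a single CSV string; record ids are data-row positions.
input_csv = "name,phone\nAlice,555-0100\nAlicia,555-0100\nBob,555-0199\n"

# One cluster saying rows 0 and 1 refer to the same entity.
clustered_dupes = [((0, 1), (0.95, 0.95))]

output = io.StringIO()
writeResults(clustered_dupes, input_csv, output)
print(output.getvalue())  # every row now starts with a 'Cluster ID' column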
Example #9
        def from_csv(table):
            filepath = os.path.join(dirpath, table.name + ".csv")
            with io.open(filepath, mode="r", encoding="utf-8") as csv_file:
                csv_reader = csv.reader(csv_file)
                header = next(csv_reader)
                conn = self.db.connect()
                with self.lock, conn.begin():
                    for row in csv_reader:
                        values = dict(zip(header, row))

                        if self.engine_dialect_is(self.DIALECT_SQLITE):
                            identifier = values[table.c.id.name]
                            # try to update entry
                            stmt = update(table).values(values).where(
                                table.c.id == identifier)
                            if conn.execute(stmt).rowcount == 0:
                                # identifier doesn't match any => insert new entry
                                stmt = insert(table).values(values)
                                conn.execute(stmt)
                        elif self.engine_dialect_is(self.DIALECT_POSTGRESQL):
                            stmt = pg_insert(table).values(values)\
                                .on_conflict_do_update(index_elements=[table.c.id], set_=values)
                            conn.execute(stmt)

                    if self.engine_dialect_is(self.DIALECT_POSTGRESQL):
                        # update sequence
                        sql = "SELECT setval('{table}_id_seq', max(id)) FROM {table}".format(
                            table=table.name)
                        conn.execute(text(sql))
                conn.close()
Example #10
    def handle(self, *args, **options):

        print(os.path.basename(options['filename']))
        os.chdir(os.path.dirname(options['filename']))
        filename = os.path.basename(options['filename'])

        reader = csv.reader(io.open(filename, encoding="utf-8"), delimiter=',')
        next(reader)  # skip the header row
        for i, row in enumerate(reader):
            if not row[11]:
                continue

            if row[1] == '名誉会長':
                rank = '00'
            elif row[1] == '会長':
                rank = '01'
            elif row[1] == '部会長':
                rank = '02'
            elif row[1] == '常任理事':
                rank = '03'
            elif row[1] == '監事':
                rank = '04'
            else:
                rank = '05'

            member = Member.objects.get(name=row[2], popular_name=row[3])
            member.rank = rank
            member.save()
Example #11
 def test_unicode_read(self):
     import io
     with TemporaryFile("w+", newline='', encoding="utf-8") as fileobj:
         fileobj.write(",".join(self.names) + "\r\n")
         fileobj.seek(0)
         reader = csv.reader(fileobj)
         self.assertEqual(list(reader), [self.names])
Example #12
        def from_csv(table):
            filepath = os.path.join(dirpath, table.name + ".csv")
            with io.open(filepath, mode="r", encoding="utf-8") as csv_file:
                csv_reader = csv.reader(csv_file)
                header = next(csv_reader)
                with self.lock, self.conn.begin():
                    for row in csv_reader:
                        values = dict(zip(header, row))

                        if self.engine.dialect.name == self.DIALECT_SQLITE:
                            identifier = values[table.c.id.name]
                            # try to update entry
                            stmt = update(table).values(values).where(
                                table.c.id == identifier)
                            if self.conn.execute(stmt).rowcount == 0:
                                # identifier doesn't match any => insert new entry
                                stmt = insert(table).values(values)
                                self.conn.execute(stmt)
                        elif self.engine.dialect.name == self.DIALECT_POSTGRESQL:
                            stmt = pg_insert(table).values(values)\
                                .on_conflict_do_update(index_elements=[table.c.id], set_=values)
                            self.conn.execute(stmt)

                    if self.DIALECT_POSTGRESQL == self.engine.dialect.name:
                        # update sequences
                        self.conn.execute(
                            text(
                                "SELECT setval('profiles_id_seq', max(id)) FROM profiles"
                            ))
                        self.conn.execute(
                            text(
                                "SELECT setval('spools_id_seq', max(id)) FROM spools"
                            ))
Example #13
def label(module, infile, outfile, xml, type=None):

    training_data = data_prep_utils.TrainingData(xml, module)

    reader = csv.reader(infile)
    strings = set(row[0] for row in reader)

    labels = module.LABELS
    if type is None:
        tagger = module.TAGGER
    else:
        tagger = module.TAGGERS[type] or module.TAGGER
    if tagger:
        labeled_list, raw_strings_left = consoleLabel(strings, labels, module,
                                                      type)
    else:
        labeled_list, raw_strings_left = naiveConsoleLabel(
            strings, labels, module)

    training_data.extend(labeled_list)

    with open(outfile, 'wb'):
        training_data.write(outfile)

    file_slug = os.path.basename(infile.name)
    if not file_slug.startswith('unlabeled_'):
        file_slug = 'unlabeled_' + file_slug
    remainder_file = os.path.join(os.path.dirname(infile.name), file_slug)

    data_prep_utils.list2file(raw_strings_left, remainder_file)
Example #14
    def load(self):
        if self._loaded is not None:
            return self._loaded

        data = []
        with io.open(self._path,
                     'r',
                     newline='',
                     encoding=self._desc.get('encoding', 'utf-8')) as csvfile:
            reader = csv.reader(csvfile,
                                delimiter=self._desc.get('separator', u','),
                                quotechar=str(self._desc.get(
                                    'quotechar', u'|')))
            data.extend(reader)

        # print data
        def to_num(s):
            try:
                return float(s)  # for int, long and float
            except ValueError:
                return s

        header = data[0]
        data = [[to_num(v) if i > 0 else v for i, v in enumerate(row)]
                for row in data[1:]]
        data.insert(0, header)

        # convert to col, row and data
        self._loaded = self._process(data)
        return self._loaded
Example #15
    def open(self):
        # Cheat the parent open
        self.encoding = 'UTF-8'
        f = Source.open(self)

        csvreader = csv.reader(f, delimiter=u';')
        structureet = [
            u'nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,lieuditbp,commune,departement,libdepartement,ligneacheminement,telephone,telecopie,categetab,libcategetab,categagretab,libcategagretab,siret,codeape,codemft,libmft,codesph,libsph,dateouv,dateautor,datemaj,numuai,coordxet,coordyet,sourcecoordet,datemajcoord'
            .split(',')
        ]
        geolocalisation = {}
        for row in csvreader:
            if row[0] == 'structureet':
                structureet.append(row[1:])
            elif row[0] == 'geolocalisation':
                geolocalisation[row[1]] = row[2:]
        for row in structureet:
            row += geolocalisation.get(row[0], [])

        csvfile = io.StringIO()
        writer = csv.writer(csvfile)
        for row in structureet:
            writer.writerow(row)
        csvfile.seek(0)

        return csvfile
Example #16
    def import_gazetters(self):
        """
        Create a set for each gazetteer defined in the config file
        """
        logging.info('Importing Gazetteers...')
        firstnames = set()
        lastnames = set()
        instrtypes = set()
        voicetypes = set()
        worktypes = set()
        notes = set()
        modes = set()
        opus = set()
        numbers = set()

        gazzetters = [(self.cfg_feat['FIRST_NAMES_GAZ'], firstnames),
                      (self.cfg_feat['LAST_NAMES_GAZ'], lastnames),
                      (self.cfg_feat['CONTR_TYPES_GAZ'], voicetypes),
                      (self.cfg_feat['INTRUMENT_TYPES_GAZ'], instrtypes),
                      (self.cfg_feat['WORK_TYPES_GAZ'], worktypes),
                      (self.cfg_feat['NOTES_GAZ'], notes),
                      (self.cfg_feat['MODES_GAZ'], modes),
                      (self.cfg_feat['OPUS_GAZ'], opus),
                      (self.cfg_feat['NUMBER_GAZ'], numbers)]

        for g_file, g_set in gazzetters:
            with io.open(g_file, newline='', encoding='utf-8') as csvfile:
                _reader = csv.reader(csvfile)
                for row in _reader:
                    g_set.add(row[0].lower())

        logging.info('Done!')
        return gazzetters
Example #17
def load_from_sheets():
    drive_folder = opj(ROOT, 'google_drive')
    data = {}
    header = {}
    keys = ['images', 'events', 'sections', 'press', 'resume', 'poems']
    for key in keys:
        data[key] = None
        header[key] = CONFIG.get('TSV', key)
    for filename in os.listdir(drive_folder):
        ext = os.path.splitext(filename)[1]
        if ext in ['.csv', '.xlsx', '.ods', '.pdf']:
            raise ValueError("You must download files as Tab-separated values")
        elif ext == '.tsv':
            with io.open(opj(drive_folder, filename), "r",
                         encoding="utf-8") as F:
                contents = list(csv.reader(F, delimiter=u'\t'))
                #contents = [[col.replace(u'&amp;', u'&') for col in row] for row in contents]
            for key in keys:
                if u';'.join(contents[0]) == header[key]:
                    if data[key] is not None:
                        raise ValueError("Multiple files for %s" % key)
                    data[key] = contents[1:]
                    break
            else:
                raise ValueError(
                    "Unrecognized file %s (did the header row change?)" %
                    filename)
    missing = [key for (key, val) in data.items() if val is None]
    if missing:
        raise ValueError("Missing data for %s" % (', '.join(missing)))
    return data
Example #18
 def header(self):
     self.f = self.source.open()
     if self.have_header:
         line = self.f.readline().strip().strip(self.separator)
          return next(csv.reader([line],
                                 delimiter=self.separator,
                                 quotechar=self.quote))
Example #19
def writeUniqueResults(clustered_dupes, input_file, output_file):

    # Write our original data back out to a CSV with a new column called 
    # 'Cluster ID' which indicates which records refer to each other.

    logging.info('saving unique results to: %s' % output_file)

    cluster_membership = {}
    for cluster_id, (cluster, score) in enumerate(clustered_dupes):
        for record_id in cluster:
            cluster_membership[record_id] = cluster_id

    unique_record_id = cluster_id + 1

    writer = csv.writer(output_file)

    reader = csv.reader(StringIO(input_file))

    heading_row = next(reader)
    heading_row.insert(0, u'Cluster ID')
    writer.writerow(heading_row)

    seen_clusters = set()
    for row_id, row in enumerate(reader):
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]
            if cluster_id not in seen_clusters:
                row.insert(0, cluster_id)
                writer.writerow(row)
                seen_clusters.add(cluster_id)
        else:
            cluster_id = unique_record_id
            unique_record_id += 1
            row.insert(0, cluster_id)
            writer.writerow(row)
Example #20
def read_data(include, data):
    """
    read csv and return the table in list.
    Return None when the include path is invalid.
    """
    if include is None:
        with io.StringIO(data) as file:
            raw_table_list = list(csv.reader(file))
    else:
        try:
            with io.open(str(include)) as file:
                raw_table_list = list(csv.reader(file))
        except IOError:  # FileNotFoundError is not in Python2
            raw_table_list = None
            panflute.debug("pantable: file not found from the path", include)
    return raw_table_list
Example #21
 def readerAssertEqual(self, input, expected_result):
     with TemporaryFile("w+", newline="") as fileobj:
         fileobj.write(input)
         fileobj.seek(0)
         reader = csv.reader(fileobj, dialect=self.dialect)
         fields = list(reader)
         self.assertEqual(fields, expected_result)
Example #22
 def test_read_dict_fieldnames_from_file(self):
     with TemporaryFile("w+") as fileobj:
         fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
         fileobj.seek(0)
         reader = csv.DictReader(fileobj, fieldnames=next(csv.reader(fileobj)))
         self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
         self.assertEqual(next(reader), {"f1": "1", "f2": "2", "f3": "abc"})
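For comparison, csv.DictReader infers the field names from the first row by itself when fieldnames is omitted, so the explicit next(csv.reader(fileobj)) above is only needed when the header row has to be inspected or reused separately. A small self-contained check:

import csv
import io

fileobj = io.StringIO("f1,f2,f3\r\n1,2,abc\r\n")
reader = csv.DictReader(fileobj)
assert reader.fieldnames == ["f1", "f2", "f3"]
assert next(reader) == {"f1": "1", "f2": "2", "f3": "abc"}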
Example #23
 def test_unicode_read(self):
     import io
     with TemporaryFile("w+", newline='', encoding="utf-8") as fileobj:
         fileobj.write(",".join(self.names) + "\r\n")
         fileobj.seek(0)
         reader = csv.reader(fileobj)
         self.assertEqual(list(reader), [self.names])
Example #24
 def test_register_kwargs(self):
     name = 'fedcba'
     csv.register_dialect(name, delimiter=';')
     try:
         self.assertEqual(csv.get_dialect(name).delimiter, ';')
         self.assertEqual([['X', 'Y', 'Z']], list(csv.reader(['X;Y;Z'], name)))
     finally:
         csv.unregister_dialect(name)
Example #25
def detect_header(inputfile, dialect, fieldnames):
    """Test if file has a header or not, also returns number of columns in first row"""
    try:
        reader = csv.reader(inputfile, dialect)
    except csv.Error:
        try:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'default')
        except csv.Error:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'excel')

    header = next(reader)
    columncount = max(len(header), 3)
    if valid_fieldnames(header):
        return header
    return fieldnames[:columncount]
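valid_fieldnames is referenced but not defined here; one plausible sketch, assuming a first row counts as a header when every cell is a non-empty, non-numeric string:

def valid_fieldnames(header):
    # Hypothetical check: accept the row as a header only if each cell is a
    # non-empty string that does not parse as a number.
    for cell in header:
        cell = cell.strip()
        if not cell:
            return False
        try:
            float(cell)
            return False
        except ValueError:
            pass
    return True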
Example #26
 def test_read_dict_fieldnames_from_file(self):
     with TemporaryFile("w+") as fileobj:
         fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
         fileobj.seek(0)
         reader = csv.DictReader(fileobj,
                                 fieldnames=next(csv.reader(fileobj)))
         self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
         self.assertEqual(next(reader), {"f1": '1', "f2": '2', "f3": 'abc'})
Example #27
 def test_roundtrip_escaped_unquoted_newlines(self):
     with TemporaryFile("w+", newline="") as fileobj:
         writer = csv.writer(fileobj, quoting=csv.QUOTE_NONE, escapechar="\\")
         rows = [["a\nb", "b"], ["c", "x\r\nd"]]
         writer.writerows(rows)
         fileobj.seek(0)
         for i, row in enumerate(csv.reader(fileobj, quoting=csv.QUOTE_NONE, escapechar="\\")):
             self.assertEqual(row, rows[i])
Example #28
 def test_register_kwargs(self):
     name = "fedcba"
     csv.register_dialect(name, delimiter=";")
     try:
         self.assertEqual(csv.get_dialect(name).delimiter, ";")
         self.assertEqual([["X", "Y", "Z"]], list(csv.reader(["X;Y;Z"], name)))
     finally:
         csv.unregister_dialect(name)
Example #29
def writeLinkedResults(clustered_pairs,
                       input_1,
                       input_2,
                       output_file,
                       inner_join=False):
    logging.info('saving unique results to: %s' % output_file)

    matched_records = []
    seen_1 = set()
    seen_2 = set()

    input_1 = [row for row in csv.reader(StringIO(input_1))]
    row_header = input_1.pop(0)
    length_1 = len(row_header)

    input_2 = [row for row in csv.reader(StringIO(input_2))]
    row_header_2 = input_2.pop(0)
    length_2 = len(row_header_2)
    row_header += row_header_2

    for pair in clustered_pairs:
        index_1, index_2 = [int(index.split('|', 1)[1]) for index in pair[0]]

        matched_records.append(input_1[index_1] + input_2[index_2])
        seen_1.add(index_1)
        seen_2.add(index_2)

    writer = csv.writer(output_file,
                        delimiter=',',
                        quotechar='"',
                        quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(row_header)

    for matches in matched_records:
        writer.writerow(matches)

    if not inner_join:

        for i, row in enumerate(input_1):
            if i not in seen_1:
                writer.writerow(row + [None] * length_2)

        for i, row in enumerate(input_2):
            if i not in seen_2:
                writer.writerow([None] * length_1 + row)
Example #30
 def test_roundtrip_quoteed_newlines(self):
     with TemporaryFile("w+", newline="") as fileobj:
         writer = csv.writer(fileobj)
         self.assertRaises(TypeError, writer.writerows, None)
         rows = [["a\nb", "b"], ["c", "x\r\nd"]]
         writer.writerows(rows)
         fileobj.seek(0)
         for i, row in enumerate(csv.reader(fileobj)):
             self.assertEqual(row, rows[i])
Example #31
def detect_header(sample, dialect, fieldnames):
    """Test if file has a header or not, also returns number of columns in first row"""
    inputfile = io.StringIO(sample)
    try:
        reader = csv.reader(inputfile, dialect)
    except csv.Error:
        try:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'default')
        except csv.Error:
            inputfile.seek(0)
            reader = csv.reader(inputfile, 'excel')

    header = next(reader)
    columncount = max(len(header), 3)
    if valid_fieldnames(header):
        return header
    return fieldnames[:columncount]
Example #32
 def test_roundtrip_quoteed_newlines(self):
     with TemporaryFile("w+", newline='') as fileobj:
         writer = csv.writer(fileobj)
         self.assertRaises(TypeError, writer.writerows, None)
         rows = [['a\nb', 'b'], ['c', 'x\r\nd']]
         writer.writerows(rows)
         fileobj.seek(0)
         for i, row in enumerate(csv.reader(fileobj)):
             self.assertEqual(row, rows[i])
Example #33
 def test_register_kwargs(self):
     name = 'fedcba'
     csv.register_dialect(name, delimiter=';')
     try:
         self.assertEqual(csv.get_dialect(name).delimiter, ';')
         self.assertEqual([['X', 'Y', 'Z']],
                          list(csv.reader(['X;Y;Z'], name)))
     finally:
         csv.unregister_dialect(name)
Example #34
 def test_read_linenum(self):
     r = csv.reader(["line,1", "line,2", "line,3"])
     self.assertEqual(r.line_num, 0)
     next(r)
     self.assertEqual(r.line_num, 1)
     next(r)
     self.assertEqual(r.line_num, 2)
     next(r)
     self.assertEqual(r.line_num, 3)
     self.assertRaises(StopIteration, next, r)
     self.assertEqual(r.line_num, 3)
Example #35
 def iterate_rows(filepath, headers):
     with io.open(filepath, encoding='utf-8') as f:
         reader = csv.reader(f,
             delimiter=self.separator,
             escapechar=self.escapeCharacter,
             lineterminator=self.lineEnd,
             quotechar=self.quoteCharacter)
         if self.header:
             header = next(reader)
         for row in reader:
             yield cast_values(row, headers)
Example #36
 def test_read_linenum(self):
     r = csv.reader(['line,1', 'line,2', 'line,3'])
     self.assertEqual(r.line_num, 0)
     next(r)
     self.assertEqual(r.line_num, 1)
     next(r)
     self.assertEqual(r.line_num, 2)
     next(r)
     self.assertEqual(r.line_num, 3)
     self.assertRaises(StopIteration, next, r)
     self.assertEqual(r.line_num, 3)
Example #37
    def run(self):
        """
        Iterate over the input tweets and for each one extract several
        features. It writes the two files needed for the NER experiments
        """
        count = 0
        max_len = 0

        with io.open(self.input_file, newline='', encoding='utf-8') as inf:
            _reader = csv.reader(inf)
            next(_reader)

            # Iterate over User Generated Tweets
            for row in _reader:
                tweet_id, creation_date, text = row[0:3]

                # Break if limit is reached
                if self.limit and count > self.limit - 1:
                    break
                if count == 0:
                    logging.info("Processing tweets...")
                elif count % 250 == 0:
                    logging.info("Processed %d tweets", count)
                count += 1

                # Extract POS and Chunk TAG
                self.tokens_tagged = self.tagger.tag_sentence(text)

                # Get max tweet length for normalization
                if len(self.tokens_tagged) > max_len:
                    max_len = len(self.tokens_tagged)

                # Add Entities annotations to tokens
                self.get_entities_annotated(tweet_id, text)

                # Add Boolean features to tokens
                self.get_boolean_features()

                # Add tweet_id
                for i, token_tagged in enumerate(self.tokens_tagged):
                    self.tokens_tagged[i] = token_tagged + (tweet_id, )

                self.out_tokens += self.tokens_tagged

            logging.info("Processed %d tweets", count)
            logging.info("Done!")

            # Add contextual features to tokens
            self.get_contextual_features()
            self.normalize_position(max_len)

            # Write output files
            self.write_weka()
            self.write_NN()
Example #38
    def __init__(self, config, logger = None):
        Analyser_Merge_Dynamic.__init__(self, config, logger)

        with open("merge_data/pitch_FR.mapping.csv") as mappingfile:
            spamreader = csv.reader(mappingfile)
            for row in spamreader:
                classs, topic = row[0:2]
                tags = list(map(lambda t: t.split('=') if t else None, row[2:5]))
                osmTags = dict(filter(lambda t: t, tags[0:2]))
                if len(osmTags) > 0:
                    defaultTags = dict(filter(lambda t: t, tags[2:3]))
                    self.classFactory(SubAnalyser_Merge_Pitch_FR, classs, classs, topic, osmTags, defaultTags)
Example #39
    def test_space_dialect(self):
        class space(csv.excel):
            delimiter = " "
            quoting = csv.QUOTE_NONE
            escapechar = "\\"

        with TemporaryFile("w+") as fileobj:
            fileobj.write("abc def\nc1ccccc1 benzene\n")
            fileobj.seek(0)
            reader = csv.reader(fileobj, dialect=space())
            self.assertEqual(next(reader), ["abc", "def"])
            self.assertEqual(next(reader), ["c1ccccc1", "benzene"])
Example #40
    def parse(data, path, project, id=None):
        desc = basic_description(data, 'matrix', path)
        desc['rowtype'] = data.get('rowtype', 'unknown')
        desc['coltype'] = data.get('coltype', 'unknown')
        desc['value'] = dict(type=data.get('value_type', 'real'))

        for k, v in data.items():
            if k not in desc:
                desc[k] = v
        if id is not None:
            desc['id'] = id

        if all(
            (k in data) for k in ['size0', 'size1', 'value_min', 'value_max']):
            desc['size'] = [int(data['size0']), int(data['size1'])]
            del desc['size0']
            del desc['size1']
            desc['value']['range'] = [
                float(data['value_min']),
                float(data['value_max'])
            ]
            del desc['value_min']
            del desc['value_max']
        else:  # derive from the data
            rows = 0
            cols = None
            min_v = None
            max_v = None
            with io.open(path,
                         'r',
                         newline='',
                         encoding=desc.get('encoding', 'utf-8')) as csvfile:
                reader = csv.reader(csvfile,
                                    delimiter=desc.get('separator', u','),
                                    quotechar=str(desc.get('quotechar', u'|')))
                for row in reader:
                    if cols is None:
                        cols = len(row) - 1
                    else:
                        rows += 1
                        min_act = min((float(f) for f in row[1:]))
                        min_v = min_act if min_v is None else min(
                            min_act, min_v)
                        max_act = max((float(f) for f in row[1:]))
                        max_v = max_act if max_v is None else max(
                            max_act, max_v)
            desc['size'] = [rows, cols]
            desc['value']['range'] = [
                float(data['value_min']) if 'value_min' in data else min_v,
                float(data['value_max']) if 'value_max' in data else max_v
            ]

        return CSVMatrix(desc, project)
Example #41
def importdata(request, formclass=ImportDataForm):
    """Generic import function

    As the process of importing data from a CSV file is the same
    whatever the type, we do a maximum of the work here.

    :param request: a ``Request`` instance
    :param typ: a string indicating the object type being imported
    :return: a ``Response`` instance
    """
    error = None
    form = formclass(request.POST, request.FILES)
    if form.is_valid():
        try:
            infile = io.TextIOWrapper(request.FILES['sourcefile'].file,
                                      encoding="utf8")
            reader = csv.reader(infile, delimiter=form.cleaned_data['sepchar'])
        except csv.Error as inst:
            error = smart_text(inst)
        else:
            try:
                cpt = 0
                for row in reader:
                    if not row:
                        continue
                    fct = signals.import_object.send(sender="importdata",
                                                     objtype=row[0].strip())
                    fct = [func for x_, func in fct if func is not None]
                    if not fct:
                        continue
                    fct = fct[0]
                    with transaction.atomic():
                        try:
                            fct(request.user, row, form.cleaned_data)
                        except Conflict:
                            if form.cleaned_data["continue_if_exists"]:
                                continue
                            raise Conflict(
                                _("Object already exists: %s" %
                                  form.cleaned_data['sepchar'].join(row[:2])))
                    cpt += 1
                msg = _("%d objects imported successfully" % cpt)
                return render(request, "admin/import_done.html", {
                    "status": "ok",
                    "msg": msg
                })
            except (ModoboaException) as e:
                error = str(e)

    return render(request, "admin/import_done.html", {
        "status": "ko",
        "msg": error
    })
Example #42
    def test_space_dialect(self):
        class space(csv.excel):
            delimiter = " "
            quoting = csv.QUOTE_NONE
            escapechar = "\\"

        with TemporaryFile("w+") as fileobj:
            fileobj.write("abc def\nc1ccccc1 benzene\n")
            fileobj.seek(0)
            reader = csv.reader(fileobj, dialect=space())
            self.assertEqual(next(reader), ["abc", "def"])
            self.assertEqual(next(reader), ["c1ccccc1", "benzene"])
Example #43
def importdata(request, formclass=ImportDataForm):
    """Generic import function

    As the process of importing data from a CSV file is the same
    whatever the type, we do a maximum of the work here.

    :param request: a ``Request`` instance
    :param typ: a string indicating the object type being imported
    :return: a ``Response`` instance
    """
    error = None
    form = formclass(request.POST, request.FILES)
    if form.is_valid():
        try:
            infile = io.TextIOWrapper(
                request.FILES["sourcefile"].file, encoding="utf8")
            reader = csv.reader(infile, delimiter=form.cleaned_data["sepchar"])
        except csv.Error as inst:
            error = smart_text(inst)
        else:
            try:
                cpt = 0
                for row in reader:
                    if not row:
                        continue
                    fct = signals.import_object.send(
                        sender="importdata", objtype=row[0].strip())
                    fct = [func for x_, func in fct if func is not None]
                    if not fct:
                        continue
                    fct = fct[0]
                    with transaction.atomic():
                        try:
                            fct(request.user, row, form.cleaned_data)
                        except Conflict:
                            if form.cleaned_data["continue_if_exists"]:
                                continue
                            raise Conflict(
                                _("Object already exists: %s"
                                  % form.cleaned_data["sepchar"].join(row[:2]))
                            )
                    cpt += 1
                msg = _("%d objects imported successfully" % cpt)
                return render(request, "admin/import_done.html", {
                    "status": "ok", "msg": msg
                })
            except (ModoboaException) as e:
                error = str(e)

    return render(request, "admin/import_done.html", {
        "status": "ko", "msg": error
    })
Example #44
    def __init__(self, config, logger = None):
        Analyser_Merge_Dynamic.__init__(self, config, logger)

        with open("merge_data/geodesie_support_FR.mapping.csv") as mappingfile:
            spamreader = csv.reader(mappingfile,  delimiter=u';')
            for row in spamreader:
                item, classs, level, topic = row[0:4]
                tags = list(map(lambda t: t.split('=') if t else None, row[4:7]))
                osmTags = dict(filter(lambda t: t, tags[0:2]))
                if len(osmTags) > 0:
                    defaultTags = dict(filter(lambda t: t, tags[2:3]))
                    slug = u''.join(filter(lambda x: x.isalpha(), topic.split('|')[0])).capitalize().encode('ascii', 'ignore').decode('utf8')
                    self.classFactory(SubAnalyser_Geodesie_Support_FR, slug, item, classs, level, topic, osmTags, defaultTags)
Example #45
def writeLinkedResults(clustered_pairs, input_1, input_2, output_file,
                       inner_join=False):
    logging.info('saving unique results to: %s' % output_file)

    matched_records = []
    seen_1 = set()
    seen_2 = set()

    input_1 = [row for row in csv.reader(StringIO(input_1))]
    row_header = input_1.pop(0)
    length_1 = len(row_header)

    input_2 = [row for row in csv.reader(StringIO(input_2))]
    row_header_2 = input_2.pop(0)
    length_2 = len(row_header_2)
    row_header += row_header_2

    for pair in clustered_pairs:
        index_1, index_2 = [int(index.split('|', 1)[1]) for index in pair[0]]

        matched_records.append(input_1[index_1] + input_2[index_2])
        seen_1.add(index_1)
        seen_2.add(index_2)

    writer = csv.writer(output_file)
    writer.writerow(row_header)

    for matches in matched_records:
        writer.writerow(matches)

    if not inner_join:

        for i, row in enumerate(input_1):
            if i not in seen_1:
                writer.writerow(row + [None] * length_2)

        for i, row in enumerate(input_2):
            if i not in seen_2:
                writer.writerow([None] * length_1 + row)
Example #46
    def __init__(self, config, logger = None):
        Analyser_Merge_Dynamic.__init__(self, config, logger)

        with open("merge_data/shop_FR.mapping.csv") as mappingfile:
            spamreader = csv.reader(mappingfile)
            for row in spamreader:
                if row[0][0] == '#':
                    continue
                items, classs, level, title = row[0:4]
                items = items.split('|')
                level = int(level)
                osmTags = list(filter(lambda a: a, map(lambda t: (t.split('=') + [None])[0:2] if t else None, row[4:])))
                if len(osmTags) > 0:
                    self.classFactory(SubAnalyser_Merge_Shop_FR, classs, items, classs, level, title, dict(osmTags), dict(filter(lambda a: a[1], osmTags)))
Example #47
    def from_table_query(cls, synapse, query, quoteCharacter='"', escapeCharacter="\\", lineEnd=str(os.linesep), separator=",", header=True, includeRowIdAndRowVersion=True):
        """
        Create a Table object wrapping a CSV file resulting from querying a Synapse table.
        Mostly for internal use.
        """

        download_from_table_result, file_info = synapse._queryTableCsv(
            query=query,
            quoteCharacter=quoteCharacter,
            escapeCharacter=escapeCharacter,
            lineEnd=lineEnd,
            separator=separator,
            header=header,
            includeRowIdAndRowVersion=includeRowIdAndRowVersion)

        ## A dirty hack to find out if we got back row ID and Version
        ## in particular, we don't get these back from aggregate queries
        with io.open(file_info['path'], 'r', encoding='utf-8') as f:
            reader = csv.reader(f,
                delimiter=separator,
                escapechar=escapeCharacter,
                lineterminator=lineEnd,
                quotechar=quoteCharacter)
            first_line = next(reader)
        if len(download_from_table_result['headers']) + 2 == len(first_line):
            includeRowIdAndRowVersion = True
        else:
            includeRowIdAndRowVersion = False

        self = cls(
            filepath=file_info['path'],
            schema=download_from_table_result.get('tableId', None),
            etag=download_from_table_result.get('etag', None),
            quoteCharacter=quoteCharacter,
            escapeCharacter=escapeCharacter,
            lineEnd=lineEnd,
            separator=separator,
            header=header,
            includeRowIdAndRowVersion=includeRowIdAndRowVersion,
            headers=[SelectColumn(**header) for header in download_from_table_result['headers']])

        return self
Example #48
    def _import(self, filename, options, encoding="utf-8"):
        """Import domains or identities."""
        superadmin = (
            core_models.User.objects.filter(is_superuser=True).first()
        )
        if not os.path.isfile(filename):
            raise CommandError("File not found")

        num_lines = sum(
            1 for line in io.open(filename, encoding=encoding) if line
        )
        pbar = progressbar.ProgressBar(
            widgets=[
                progressbar.Percentage(), progressbar.Bar(), progressbar.ETA()
            ], maxval=num_lines
        ).start()
        with io.open(filename, encoding=encoding, newline="") as f:
            reader = csv.reader(f, delimiter=options["sepchar"])
            i = 0
            for row in reader:
                if not row:
                    continue
                fct = signals.import_object.send(
                    sender=self.__class__, objtype=row[0].strip())
                fct = [func for x_, func in fct if func is not None]
                if not fct:
                    continue
                fct = fct[0]
                try:
                    fct(superadmin, row, options)
                except Conflict:
                    if options["continue_if_exists"]:
                        continue
                    raise CommandError(
                        "Object already exists: {}".format(
                            options["sepchar"].join(row[:2])))
                i += 1
                pbar.update(i)

        pbar.finish()
Example #49
    def open(self):
        # Cheat the parent open
        encoding, self.encoding = self.encoding, 'UTF-8'
        f = Source.open(self)

        csvreader = csv.reader(f, delimiter=u';')
        structureet = [u'nofinesset,nofinessej,rs,rslongue,complrs,compldistrib,numvoie,typvoie,voie,compvoie,lieuditbp,commune,departement,libdepartement,ligneacheminement,telephone,telecopie,categetab,libcategetab,categagretab,libcategagretab,siret,codeape,codemft,libmft,codesph,libsph,dateouv,dateautor,datemaj,numuai,coordxet,coordyet,sourcecoordet,datemajcoord'.split(',')]
        geolocalisation = {}
        for row in csvreader:
            if row[0] == 'structureet':
                structureet.append(row[1:])
            elif row[0] == 'geolocalisation':
                geolocalisation[row[1]] = row[2:]
        for row in structureet:
            row += geolocalisation.get(row[0], [])

        csvfile = io.StringIO()
        writer = csv.writer(csvfile)
        for row in structureet:
            writer.writerow(row)
        csvfile.seek(0)

        return csvfile
Example #50
            "STL",
            "BLK",
            "PF",
        ]
    )
    sys.exit(0)

pwd = sys.argv[1]
os.chdir(pwd)

parts = pwd.rstrip("\r\n").split("/")
game, season, division, team, date = parts[-5:]
date = date.replace(".", "/")

with open("Info.csv") as f:
    data = list(csv.reader(f))[1]

    if len(data) == 2:
        time, officials = data
        location = ""
        attendance = ""
    elif len(data) == 3:
        time, location, officials = data
        attendance = ""
    else:
        time, location, attendance, officials = data
        attendance = attendance.replace(",", "")

    time_struct, _ = parsedatetime.Calendar().parse(time)
    time = "{:02d}:{:02d}".format(*(time_struct[3:5]))
Example #51
import io
import sys

import backports.csv as csv
import html2text

from markdown import markdown

CSV_FILE_NAME = 'forums.csv' # the file to import
NEW_CSV_FILE_NAME = 'forums_markdown.csv' # the file to create

# some content fields are bigger than csv.field_size_limit
csv.field_size_limit(sys.maxsize)
with io.open(CSV_FILE_NAME, 'r') as csvfile, io.open(NEW_CSV_FILE_NAME, 'w') as writecsvfile:
    reader = csv.reader(csvfile, delimiter=u',', quotechar=u'"')
    writer = csv.writer(writecsvfile)
    counter = 0
    for row in reader:
        col_number = 0
        my_row = []
        for col in row:
            if col_number == 3:
                # use the permalink as the file name
                title = col
                
            col_number = col_number + 1
            if col_number == 6:# & counter != 0:
                # aha, a content field!
                h = html2text.HTML2Text()
                markdown_col = h.handle(col)
Example #52
def read_csv(fname):
	with iopen(fname, newline='', encoding='utf-8-sig') as f:
		for row in reader(f):
			yield row
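read_csv is a generator, so rows can be consumed lazily; a small usage sketch, assuming the snippet's iopen/reader imports (from io and csv) are in place and using a placeholder file name:

# utf-8-sig transparently strips a leading BOM if one is present.
for row in read_csv('data.csv'):
    print(row)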
Example #53
 def _read_test(self, input, expect, **kwargs):
     reader = csv.reader(input, **kwargs)
     result = list(reader)
     self.assertEqual(result, expect)
Example #54
def _read_csv(file_path, delimiter=DEFAULT_DELIMITER):
    return csv.reader(
        read_lines(file_path),
        delimiter=delimiter
    )
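read_lines and DEFAULT_DELIMITER come from elsewhere in that module; a minimal sketch of plausible definitions, assuming the helper only needs to stream decoded lines so csv.reader never holds the whole file in memory:

import io

DEFAULT_DELIMITER = ','


def read_lines(file_path, encoding='utf-8'):
    # Hypothetical helper: yield the file line by line as text.
    with io.open(file_path, 'r', newline='', encoding=encoding) as f:
        for line in f:
            yield line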