Example #1
 def guess_dialect(self, sample):
     sniffer = Sniffer()
     try:
         dialect = sniffer.sniff(sample)
         has_header = sniffer.has_header(sample)
     except Error:  # delimiter detection failed; maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
         has_header = False  # let's just guess the value
         s = sample.split("\n")[1]  # skip the header line; it is guaranteed not to contain an empty column
         delimiter = ""
         for dl in (",", ";", "|"):  # assume a character that appears doubled is the delimiter (the empty column doubles it)
             if s.find(dl + dl) > -1:
                 delimiter = dl
                 break
         if not delimiter:  # try to find anything that resembles a delimiter
             for dl in (",", ";", "|"):
                 if s.find(dl) > -1:
                     delimiter = dl
                     break
         dialect = csv.unix_dialect
         dialect.delimiter = delimiter
     if not dialect.escapechar:
         dialect.escapechar = '\\'
     # dialect.quoting = 3
     dialect.doublequote = True
     return dialect, has_header
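A minimal usage sketch for the method above; the importer instance and the file path are illustrative assumptions, not part of the original project:

import csv

# Hypothetical call site: importer is an instance of the class that defines
# guess_dialect() above, and "data.csv" is a placeholder path.
with open("data.csv", newline="") as f:
    sample = f.read(4096)
    f.seek(0)
    dialect, has_header = importer.guess_dialect(sample)
    rows = csv.reader(f, dialect=dialect)
    if has_header:
        next(rows, None)  # drop the header row before processing data lines
    for row in rows:
        print(row)
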
    def create_reader(self, csvfile):
        """
        Summary: Validates a csv file, returns a DictReader object.

        Description: Takes one argument: "csvfile" (should be an open csv file object)
        """
        # Determines the dialect of the csv file for processing
        file_dialect = Sniffer().sniff(csvfile.read(1024))

        # Resets the read/write pointer within the file
        csvfile.seek(0)

        # Checks to see that the csv file imported has a header row,
        # that will be used for later parsing.
        if not Sniffer().has_header(csvfile.read(1024)):
            print('Imported csv file lacks header row')
            exit()

        # Resets the read/write pointer within the file
        csvfile.seek(0)

        # Creates a DictReader object with the csvfile provided, and the
        # dialect object to define the parameters of the reader instance.
        reader = DictReader(csvfile, dialect=file_dialect)

        # Return DictReader object
        return reader
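A hedged sketch of how create_reader might be called; the validator instance and the file name are assumptions used only for illustration:

# Hypothetical call site: validator is an instance of the class that defines
# create_reader() above; "people.csv" is a placeholder CSV with a header row.
with open("people.csv", newline="") as csvfile:
    reader = validator.create_reader(csvfile)
    for row in reader:
        print(row)  # each row is a dict keyed by the header fields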
Example #3
 def guess_dialect(self, sample):
     sniffer = Sniffer()
     try:
         dialect = sniffer.sniff(sample)
         has_header = sniffer.has_header(sample)
     except Error:  # delimiter detection failed; maybe there is an empty column: "89.187.1.81,06-05-2016,,CZ,botnet drone"
         if sample.strip() == "":
             print("The file seems empty")
             quit()
         has_header = False  # let's just guess the value
         try:
             s = sample.split("\n")[1]  # skip the header line; it is guaranteed not to contain an empty column
         except IndexError:  # there is a single line in the file
             s = sample.split("\n")[0]
         delimiter = ""
         for dl in (",", ";", "|"):  # assume a character that appears doubled is the delimiter (the empty column doubles it)
             if s.find(dl + dl) > -1:
                 delimiter = dl
                 break
         if not delimiter:  # try to find anything that resembles a delimiter
             for dl in (",", ";", "|"):
                 if s.find(dl) > -1:
                     delimiter = dl
                     break
         dialect = csv.unix_dialect
         dialect.delimiter = delimiter
     if not dialect.escapechar:
         dialect.escapechar = '\\'
     # dialect.quoting = 3
     dialect.doublequote = True
     return dialect, has_header
Example #4
 def __init__(self, inFile):
     from csv import Sniffer, reader
     csvFile = open(inFile, 'r')
     sample = csvFile.read(1024)
     csvFile.seek(0)
     sniffer = Sniffer()
     self.reader = reader(csvFile, sniffer.sniff(sample))
     if sniffer.has_header(sample):
         self.varNames = next(self.reader)
     else:
         self.varNames = None
 def guess_dialect(f):
     # Taken from the Python standard docs, with thanks to Piers Goodhew <*****@*****.**>
     # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
     s = Sniffer()
     try:
         retval = s.sniff(f.read(1024), [',', '\t', ])  # 1024 taken from the Python docs
     except CSVError:
         retval = 'excel'
     finally:
         f.seek(0)  # The above f.read moves the file-cursor in the CSV file.
     return retval
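Because csv.reader accepts either a Dialect object or a registered dialect name such as 'excel', the string fallback above plugs straight into a reader. A sketch of the intended call pattern, with a placeholder path:

import csv

with open("data.csv", newline="") as f:
    dialect = guess_dialect(f)  # a Dialect instance, or the name 'excel' on sniff failure
    for row in csv.reader(f, dialect):
        print(row)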
Example #7
def converter():
    if not session['extension'] or not session['content_type']:
        flash('Unsupported file type', 'info')
        return redirect(url_for('.dropzone'))

    session['processed'] = False
    session['outputs'] = mkdtemp()

    is_csv = (session['extension'] == '.csv'
              or session['content_type'].startswith('text/csv'))
    is_excel = session['extension'] in ['.xls', '.xlsx'] or any(
        s in session['content_type']
        for s in ['spreadsheet', 'xls', 'xlsx', 'excel'])
    is_text = (session['extension'] == '.txt'
               or session['content_type'].startswith('text/'))

    if is_csv:
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)

        # guess file encoding
        encoding = get_encoding(file_path)

        # guess separator
        with open(file_path, encoding=encoding) as f:
            sniffer = Sniffer()
            line = f.readline().encode(encoding).decode('utf-8')
            dialect = sniffer.sniff(line)

        df = pd.read_csv(file_path, encoding=encoding, dialect=dialect)

        session['fields'] = df.columns.tolist()

    elif is_excel:
        file_name = listdir(session['tmp_dir'])[0]
        file_path = join(session['tmp_dir'], file_name)

        df = pd.read_excel(file_path, encoding='utf-8')
        session['fields'] = df.columns.tolist()

    elif is_text:
        session['fields'] = ['id', 'text']
        dataset_json = texts_to_json(session['tmp_dir'])
        df = DataFrame(dataset_json)

    else:
        flash('Uploaded file types have not been recognized')
        return redirect(url_for('.dropzone'))

    df.to_csv(join(session['outputs'], 'original.csv'),
              index=False,
              encoding='utf-8')

    return redirect(url_for('.field_selection_get'))
 def guess_dialect(f):
     # Taken from the Python standard docs, with thanks to Piers Goodhew <*****@*****.**>
     # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
     s = Sniffer()
     try:
         retval = s.sniff(f.read(1024), [',', '\t'])  # 1024 taken from the Python docs
     except CSVError:
         retval = 'excel'
     finally:
         f.seek(0)  # The above f.read moves the file-cursor in the CSV file.
     return retval
Example #9
def analyze_csv(file):
    try:
        dialect = Sniffer().sniff(''.join(read_lines_file(file, config.num_lines_to_check_csv_dialect)))
        log.info(f"delimiter: ({dialect.delimiter}) doublequote: ({dialect.doublequote}) "
                 f"escapechar: ({dialect.escapechar}) "
                 f"lineterminator: ({dialect.lineterminator}) quotechar: ({dialect.quotechar}) "
                 f"quoting: ({dialect.quoting}) "
                 f"skipinitialspace: ({dialect.skipinitialspace})")
    except _csv.Error:
        log.info("Não foi possível determinar o delimitador.")
        return ()
    try:
        file.seek(0)
        csv_file = reader(read_lines_file(file, config.num_lines_to_check_type_of_place), dialect, quoting=QUOTE_ALL)
        csv_file = list(csv_file)
    except _csv.Error as err:
        log.info("<><><><><><><><><><><><><><><>")
        log.info(err)
        return ()
    # Determine the likely length of each csv row
    try:
        len_row = mode([len(x) for x in csv_file[0:]])
        log.info(f"tamanho provável da linha: {len_row}")
    except StatisticsError:
        log.info('Problem determining the likely length of each csv row')
        return ()
    time_i = time()
    types_and_indexes_ = types_and_indexes(csv_file, driver)
    time_f = time()
    log.info(f"Tempo para verificação de tipos: {time_f - time_i}")
    if not types_and_indexes_[0]:
        return ()
    file.seek(0)
    return dialect, len_row, types_and_indexes_
Example #10
def read_certified_applications(csv_file_path):
    """
    Generator function for reading the input CSV files.  This function does the following:

    1.  We sniff the file format so we can support both default excel formatted csv and the
        semi-colon separated files (see Test 1 and Test 2).

    2.  We check if the file uses the previous LCA record layout. If so, we migrate it to the
        current H1B Record Layout.

    3.  Filter out all applications that are not 'CERTIFIED'.

    The goal of this function is to separate code for reading the input file format from our
    analytical code.

    :param csv_file_path:
    :return:
    """
    print('Processing input file: {0}'.format(csv_file_path))

    with open(csv_file_path) as csv_file:

        # Sniff the file format
        dialect = Sniffer().sniff(csv_file.read(16384))
        csv_file.seek(0)

        reader = DictReader(csv_file, dialect=dialect)

        # Dirty migration of the previous LCA Record Layout to the current H1B Record Layout
        if 'LCA_CASE_NUMBER' in reader.fieldnames:
            reader.fieldnames = get_migrated_fieldnames()

        # Filter all applications that are not certified
        yield from filterfalse(lambda row: row['CASE_STATUS'] != 'CERTIFIED',
                               reader)
Example #11
def sniff(file):
    # The pandas default behavior is to look at filename extensions,
    # but we decided we can't rely on those to be accurate.
    compression = {
        b'\x1f\x8b': 'gzip'
    }.get(file.read(2))
    file.seek(0)

    encoding = 'latin-1'
    if compression:
        peek_window = 1024  # arbitrary
        if compression == 'gzip':
            first_bytes = gzip.open(file).peek(peek_window)
        else:
            raise Exception(
                'Unsupported compression type: {}'.format(compression))
        first_characters = first_bytes.decode(encoding)
    else:
        first_characters = file.readline().decode(encoding)
    is_gct = first_characters.startswith('#1.2')
    dialect = excel_tab if is_gct else Sniffer().sniff(first_characters)

    if search(r'\W', first_characters.split('\n')[0]):
        is_list = False
        as_list = None
    else:
        # No non-word characters in first line
        file.seek(0)
        is_list = True
        as_list = DataFrame({'item': [bytes.decode(encoding) for
                                      bytes in file.read().splitlines()]})
    file.seek(0)
    return SniffResult(compression=compression, is_gct=is_gct, dialect=dialect,
                       is_list=is_list, as_list=as_list)
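A sketch of calling sniff() on a binary file handle; SniffResult is the result record defined elsewhere in the same module, and the file name is a placeholder:

# The function expects a binary handle so it can inspect the gzip magic bytes.
with open("expression.gct.gz", "rb") as f:
    result = sniff(f)
print(result.compression, result.is_gct, result.dialect.delimiter)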
Example #12
def load_languages(
    path: FileSystemPath,
    dialect: Optional[DialectLike] = None,
    prefix: str = "",
) -> Dict[str, Language]:
    """Return a dictionnary mapping each column to a language file."""
    with open(path, newline="") as csv_file:
        if not dialect:
            dialect = Sniffer().sniff(csv_file.read(1024))
            csv_file.seek(0)

        reader = DictReader(csv_file, dialect=dialect)

        key, *language_codes = reader.fieldnames or [""]
        languages = {code: Language({}) for code in language_codes}

        for row in reader:
            if not (identifier := row[key]):
                continue

            identifier = prefix + identifier

            for code in language_codes:
                if value := row[code]:
                    languages[code].data[identifier] = value
                else:
                    msg = f"Locale {code!r} has no translation for {identifier!r}"
                    logger.warning(msg)
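The excerpt stops before its return statement, but the annotation indicates the languages mapping is returned. Assuming a translations CSV whose first column holds the identifier and whose remaining columns each hold one locale (for example key,en,de), a call might look like this; the Language container comes from the surrounding project:

# translations.csv (dialect is sniffed automatically):
#   key,en,de
#   menu.start,Start,Starten
#   menu.quit,Quit,Beenden
languages = load_languages("translations.csv", prefix="app.")
print(languages["de"].data["app.menu.start"])  # -> "Starten"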
Example #13
def processCSVMatrix(file):
    with open(file, 'r') as csvfile:
        dialect = Sniffer().sniff(csvfile.readline())

    df = DataFrame()
    for chunk in read_csv(file,
                          sep=dialect.delimiter,
                          mangle_dupe_cols=True,
                          index_col=False,
                          chunksize=1000):
        df = concat([df, chunk], ignore_index=True)

    nodes = df.columns.values.tolist()
    nodes.pop(0)
    df["Unnamed: 0"] = nodes
    df = df.rename(columns={'Unnamed: 0': 'name'})
    df = df.set_index(keys='name')

    # Remove underscores in names
    names = df.columns.tolist()
    names = [name.replace('_', ' ') for name in names]
    df.columns = names
    df.set_index([df.columns], inplace=True)

    return df
    def __init__(self, filepath: str) -> None:
        '''
        Opens the data file and loads its content.

        Args:
            filepath: str
                The path to the data file that contains keywords
                associated with the collected job descriptions.

        Raises:
            FileNotFoundError: filepath is not correct or the related
                file cannot be found.
            PermissionError: read access to the specified file
                cannot be granted.
        '''
        with open(filepath, 'r') as csv_f:

            # automatically detects separators and internal format of the CSV file
            csv_dialect = Sniffer().sniff(csv_f.read(1024))

            # rewinds it
            csv_f.seek(0)

            # and reads its content
            self.entries = [row for row in reader(csv_f, csv_dialect)]
Example #15
 def _detect_dialect(cls, csv_file: TextIO, dialect: Dialect) -> Dialect:
     number_of_rows = 1024
     start_of_file = 0
     if not dialect:
         dialect = Sniffer().sniff(csv_file.read(number_of_rows))
         csv_file.seek(start_of_file)
     return dialect
Example #16
    def __init__(self, source, schema, **kargs):
        super().__init__(source, **kargs)
        self.__source = source
        self.__filename = os.path.basename(source)
        self.__headers_4_mipmap = OrderedDict()
        # if csv file get headers (tabulator aka csv file)
        if not self._Table__storage:
            # used encoding utf-8-sig to remove the byte order mask (BOM)
            with open(source, 'r', encoding='utf-8-sig') as csv_file:
                # find the dialect of the csv file
                try:
                    dialect = Sniffer().sniff(csv_file.read(1024))
                except:
                    dialect = 'excel'
                # reset the seeker to the start of the file
                csv_file.seek(0)
                reader = DictReader(csv_file, dialect=dialect)
                self.__actual_headers = reader.fieldnames
        else:
            self.__actual_headers = None

        if self.__actual_headers:
            self.__create_headers_4_mipmap()

        # QcSchema
        if isinstance(schema, QcSchema):
            self._Table__schema = schema
            self.__metadata = True
        elif isinstance(schema, dict):
            self._Table__schema = QcSchema(schema)
            self.__metadata = True
        else:
            self.__metadata = False
Example #17
def trainiere():
    # Collect the data from the POST request
    maxBlattPunkte = request.form["maxBlattPunkte"]
    blattNamen = request.form["blattNamen"]
    maxKlausurPunkte = request.form.get("maxKlausurPunkte")
    klausurPunkteName = request.form.get("klausurPunkteName")
    note = request.form.get("note")

    datei = request.files["daten"]
    inhalt = datei.read()
    sep = Sniffer().sniff(str(inhalt)[:100]).delimiter
    df = pd.read_csv(StringIO(str(inhalt, "utf-8")), sep=sep)
    df.replace(",", ".", regex=True, inplace=True)

    try:
        # The data is adjusted to match the original data and the important features are stored
        df = bereiteTraining(df, maxBlattPunkte.split(","), blattNamen,
                             maxKlausurPunkte, klausurPunkteName, note)
        df.to_csv("Server/data/Training/" + datei.filename, index=False)

        # All estimators are retrained with the updated data
        schaetzer.trainiere()

        flash("Upload erfolgreich!", "erfolg")
    except Exception as e:
        flash("Upload leider nicht erfolgreich: " + str(e), "error")
        return redirect(
            url_for("startseite",
                    maxBlattPunkte=maxBlattPunkte,
                    blattNamen=blattNamen,
                    maxKlausurPunkte=maxKlausurPunkte,
                    klausurPunkteName=klausurPunkteName,
                    note=note))

    return redirect(url_for("startseite"))
Example #18
File: importer.py Project: gaudenz/bosco
    def __init__(self, fname, encoding, verbose=False):

        self._verbose = verbose

        # List of dicts
        self.data = []

        # Set up CSV reader
        fh = open(fname, 'rb')
        try:
            dialect = Sniffer().sniff(fh.read(1024))
            fh.seek(0)
            csv = reader(fh, dialect=dialect)
        except Error:
            fh.seek(0)
            csv = reader(fh, delimiter="\t")

        # Read labels
        labels = [v.strip() for v in csv.next()]
        self._fieldcount = len(labels)

        # Read values
        for line in csv:
            try:
                if line[0].strip()[0] == '#':
                    # skip comment lines
                    continue
            except IndexError:
                pass
            d = {}
            for i, v in enumerate(line):
                d[labels[i]] = v.decode(encoding).strip()

            self.data.append(d)
Example #19
    def iter_dataset(self, options):
        fp = options.get("dataset_file")
        filetype = options["filetype"]

        if filetype == "mongo":
            assert options["mongo_collection"]
            db = self.get_mongo_db()
            coll = db[options["mongo_collection"]]

            for l in coll.find():
                del l["_id"]
                yield l

        elif filetype == "json":
            for l in json.load(fp):
                yield l

        elif filetype == "jsonlines":
            for l in fp:
                if l:
                    yield json.loads(l)

        elif filetype == "csv":
            if self.csv_dialect is None:
                dialect = Sniffer().sniff(fp.read(1024 * 16))
                fp.seek(0)
            else:
                dialect = self.csv_dialect

            r = DictReader(fp, dialect=dialect)
            for l in r:
                yield l
        else:
            raise NotImplementedError()
Example #20
def get_projects(brigade, projects_list_url):
    ''' 
        Get a list of projects from CSV, TSV, or JSON.
        Convert to a dict.
        TODO: Have this work for GDocs.
    '''
    print 'Asking for', projects_list_url
    got = get(projects_list_url)

    # If projects_list_url is a json file
    try:
        projects = [
            dict(brigade=brigade, code_url=item) for item in got.json()
        ]

    # If projects_list_url is a type of csv
    except ValueError:
        data = got.text.splitlines()
        dialect = Sniffer().sniff(data[0])
        projects = list(DictReader(data, dialect=dialect))
        for project in projects:
            project['brigade'] = brigade

    map(update_project_info, projects)

    return projects
Example #21
def getDelimiter(path):
    sniffer = Sniffer()
    with open(path, 'r') as rfile:
        header = rfile.readline()
        sample = header + rfile.readline() + rfile.readline()
    try:
        asniff = sniffer.sniff(sample, delimiters=";, ")
    except Exception:

        class tsniff(object):
            lineterminator = "\n"
            delimiter = ","

        asniff = tsniff()
        asniff.lineterminator = "\n"
    return asniff.delimiter, sniffer.has_header(sample)
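A short usage sketch that feeds the detected delimiter into a plain csv.reader; the path is a placeholder:

import csv

delimiter, has_header = getDelimiter("measurements.csv")
with open("measurements.csv", newline="") as f:
    rows = csv.reader(f, delimiter=delimiter)
    if has_header:
        next(rows, None)  # skip the detected header row
    for row in rows:
        print(row)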
Example #22
 def __download_as_pandas(self, chunksize, sniff_ahead=2**20):
     """Download and parse data from URL as a table"""
     with self.__tempfile() as tempfile:
         self.url = self.__copyfileobj(tempfile)
         with open(tempfile, mode="rb") as handle:
             magic = handle.read(3)
         if magic == b"\x1f\x8b\x08":
             compression = "gzip"
             from gzip import open as _open
         elif magic == b"\x42\x5a\x68":
             compression = "bz2"
             from bz2 import open as _open
         else:
             compression, _open = "infer", open
         try:
             with _open(tempfile, mode="rt", newline="") as handle:
                 sep = Sniffer().sniff(handle.read(sniff_ahead)).delimiter
             _reader_kw = dict(
                 sep=sep, compression=compression,
                 chunksize=chunksize, **self.pandas_kws,
             )
             for i, csv_chunk in enumerate(read_csv(tempfile, **_reader_kw)):
                 self.INPLACE_process(csv_chunk)
                 msg = f"interpreted table chunk {i}:\n  {tempfile}"
                 GeneFabLogger.info(f"{self.name}; {msg}")
                 yield csv_chunk
         except (IOError, UnicodeDecodeError, CSVError, PandasParserError):
             msg = "Not recognized as a table file"
             raise GeneFabFileException(msg, name=self.name, url=self.url)
Example #23
    def read(file):

        with open(file, "r") as csv:

            sniffer = Sniffer()
            sample = csv.read(4096)
            dialect = sniffer.sniff(sample, delimiters=[';', ','])

            csv.seek(0)

            lines_reader = DictReader(csv, dialect=dialect)

            lines = []
            for line in lines_reader:
                lines.append(line)

            return lines, lines_reader.fieldnames
Example #24
 def has_csv_header(self):
     """
     Checks if csv file has header
     """
     self.file.seek(0)
     has_header = Sniffer().has_header(next(iter(self)))
     self.file.seek(0)
     return has_header
Example #25
 def get_dialect(self, *args, lines=5, **kwargs):
     """
     Wrap csv.Sniffer.sniff to handle comments
     """
     self.file.seek(0)
     dialect = Sniffer().sniff(
         "".join(line for line, _ in zip(self, range(lines))), *args, **kwargs)
     self.file.seek(0)
     return dialect
Example #26
def test_CSVLogger(tmpdir):
    np.random.seed(1337)
    filepath = str(tmpdir / 'log.tsv')
    sep = '\t'
    (X_train, y_train), (X_test, y_test) = get_test_data(num_train=train_samples,
                                                         num_test=test_samples,
                                                         input_shape=(input_dim,),
                                                         classification=True,
                                                         num_classes=num_classes)
    y_test = np_utils.to_categorical(y_test)
    y_train = np_utils.to_categorical(y_train)

    def make_model():
        np.random.seed(1337)
        model = Sequential()
        model.add(Dense(num_hidden, input_dim=input_dim, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizers.SGD(lr=0.1),
                      metrics=['accuracy'])
        return model

    # case 1, create new file with defined separator
    model = make_model()
    cbks = [callbacks.CSVLogger(filepath, separator=sep)]
    model.fit(X_train, y_train, batch_size=batch_size,
              validation_data=(X_test, y_test), callbacks=cbks, epochs=1)

    assert os.path.isfile(filepath)
    with open(filepath) as csvfile:
        dialect = Sniffer().sniff(csvfile.read())
    assert dialect.delimiter == sep
    del model
    del cbks

    # case 2, append data to existing file, skip header
    model = make_model()
    cbks = [callbacks.CSVLogger(filepath, separator=sep, append=True)]
    model.fit(X_train, y_train, batch_size=batch_size,
              validation_data=(X_test, y_test), callbacks=cbks, epochs=1)

    # case 3, reuse of CSVLogger object
    model.fit(X_train, y_train, batch_size=batch_size,
              validation_data=(X_test, y_test), callbacks=cbks, epochs=2)

    import re
    with open(filepath) as csvfile:
        list_lines = csvfile.readlines()
        for line in list_lines:
            assert line.count(sep) == 4
        assert len(list_lines) == 5
        output = " ".join(list_lines)
        assert len(re.findall('epoch', output)) == 1

    os.remove(filepath)
    assert not tmpdir.listdir()
Example #27
    def load(self, filename):
        assert filename is not None, "No datafile for %s!" % self.label
        with filename.open(encoding="utf-8-sig") as handle:
            dialect = Sniffer().sniff(handle.readline(), [',', ';'])
            dialect.doublequote = True
            handle.seek(0)
            with csvw.UnicodeDictReader(handle, dialect=dialect) as reader:
                for i, row in enumerate(reader, 2):  # 2 as row 1 is consumed for header
                    # standardise NA values.
                    try:
                        row = {k: self.fix_value(v) for (k, v) in row.items()}
                    except:
                        print("ERROR PARSING %s" % filename)
                        print("ROW %d: %r" % (i, row))
                        raise
                    
                    
                    if row['word'] == '':  # skip empty rows.
                        continue

                    # merge notes column into comment
                    try:
                        note = row.pop('notes')
                    except KeyError:
                        note = ""

                    if note:
                        if row.get("comment", "") == "":
                            row['comment'] = note
                        else:
                            print('NOTE', filename, row['comment'], note)
                            raise ValueError("Fix %s:%d manually" % (self.label, i))

                    # lose translation
                    try:
                        trans = row.pop('translation')
                    except KeyError:
                        trans = ""

                    if trans:
                        print("Value %s in `translation` will be ignored." % trans)
                        raise ValueError("Fix %s:%d manually" % (self.label, i))
                    
                    yield(row)
Example #28
def load_csv(path: str = 'employees.csv') -> List[List]:
    try:
        with open(path) as f:
            dialect = Sniffer().sniff(f.read(1024))
            f.seek(0)
            rdr = reader(f, dialect)
            return list(rdr)
    except FileNotFoundError:
        warn(f'File not found. Please, make sure to place {path} file in the directory!')
        exit(1)
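A brief usage sketch; the default employees.csv path comes from the signature, while splitting off a header row is an assumption about the file's layout:

rows = load_csv()
header, records = rows[0], rows[1:]
print(header)
print(f"{len(records)} data rows loaded")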
Example #29
File: s_csv.py Project: ms-jpq/sortd
def p_csv(dialect: Optional[str], padding: bool) -> int:
    data = stdin.read()
    joe_biden = Sniffer()
    has_header = joe_biden.has_header(data)

    try:
        if not has_header:
            print(data, end="")
            return 0
        else:
            d = dialect or joe_biden.sniff(data)
            r = _read(data, dialect=d, padding=padding)
            w = writer(stdout, dialect=d)
            w.writerows(r)
    except CSVErr as e:
        log.critical("%s", f"{ERROR}{linesep}{e}")
        return 1
    else:
        return 0
Example #30
def simulation_matrix(sim_file):
    if not Path(sim_file).exists():
        raise ValueError("Invalid simulation-matrix")

    with open(sim_file) as csvfile:
        dialect = Sniffer().sniff(csvfile.read(4096))
        csvfile.seek(0)
        reader = DictReader(csvfile, dialect=dialect)
        for row in reader:
            yield row
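Since simulation_matrix is a generator of dicts keyed by the CSV header, consuming it is a plain loop; the path is a placeholder:

for run in simulation_matrix("matrix.csv"):
    print(run)  # one dict per simulation row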
Example #31
File: csv_logs.py Project: felixn/vunit
    def add(self, pattern):
        # pylint: disable=missing-docstring
        for csv_file in [Path(p).resolve() for p in glob(pattern)]:
            with csv_file.open("r", encoding=self._encoding) as fread:
                sample = fread.readline()
                fread.seek(0)
                if sample:
                    dialect = Sniffer().sniff(sample)
                    self._entries += DictReader(fread, fieldnames=self._field_names, dialect=dialect)

        self._entries.sort(key=lambda dictionary: int(dictionary["#"]))
Example #32
    def _read_file_safe(
        self, some_file: File, ignore_headers: bool
    ) -> Generator[Either[Message, SomeModel], None, None]:
        sniffer = Sniffer()
        try:
            with some_file as csv:
                dialect = sniffer.sniff(csv.read(1024))

                csv.seek(0)

                reader = DictReader(f=csv,
                                    fieldnames=self._fields,
                                    dialect=dialect)

                yield from self._read(reader, ignore_headers)
        except Exception as e:
            message = Message(category=MessageCategory.ERROR,
                              key='import_csv_generic_error',
                              args=[e])

            yield Left([message])
Example #33
    def read(filename):

        with open(filename, "r") as csvfile:

            sniffer = Sniffer()
            sample = csvfile.read(4096)
            dialect = sniffer.sniff(sample, delimiters=[';', ','])

            if sniffer.has_header(sample):
                # file has header
                pass

            csvfile.seek(0)

            lines_reader = DictReader(csvfile, dialect=dialect)

            lines = []
            for line in lines_reader:
                lines.append(line)

            return lines, lines_reader.fieldnames
Example #34
    def add(self, pattern):
        for csv_file in [abspath(p) for p in glob(pattern)]:
            with open(csv_file, "r") as f:
                sample = f.readline()
                f.seek(0)
                if len(sample) > 0:
                    dialect = Sniffer().sniff(sample)
                    self._entries += DictReader(f,
                                                fieldnames=self._field_names,
                                                dialect=dialect)

        self._entries.sort(key=lambda dictionary: int(dictionary['#']))
Example #35
    def read(filename):

        with open(filename, "r") as csvfile:

            sniffer = Sniffer()
            sample = csvfile.read(1024)
            dialect = sniffer.sniff(sample, delimiters=[';', ','])

            if sniffer.has_header(sample):
                # file has header
                pass

            csvfile.seek(0)

            lines_reader = DictReader(csvfile, dialect=dialect)

            lines = []
            for line in lines_reader:
                lines.append(line)

            return lines
Example #36
File: run_update.py Project: maya/cfapi
def get_projects(organization):
    '''
        Get a list of projects from CSV, TSV, JSON, or Github URL.
        Convert to a dict.
        TODO: Have this work for GDocs.
    '''
    _, host, path, _, _, _ = urlparse(organization.projects_list_url)
    matched = match(r'(/orgs)?/(?P<name>[^/]+)/?$', path)

    if host in ('www.github.com', 'github.com') and matched:
        projects_url = 'https://api.github.com/users/%s/repos' % matched.group('name')
    else:
        projects_url = organization.projects_list_url

    logging.info('Asking for ' + projects_url)
    response = get(projects_url)

    try:
        data = get_adjoined_json_lists(response)

    except ValueError:
        # If projects_list_url is a type of csv
        data = response.text.splitlines()
        dialect = Sniffer().sniff(data[0])
        
        #
        # Google Docs CSV output uses double quotes instead of an escape char,
        # but there's not typically a way to know that just from the dialect
        # sniffer. If we see a comma delimiter and no escapechar, then set
        # doublequote to True so that GDocs output doesn't barf.
        #
        if dialect.delimiter == ',' and dialect.doublequote is False and dialect.escapechar is None:
            dialect.doublequote = True
        
        projects = list(DictReader(data, dialect=dialect))
        for project in projects:
            project['organization_name'] = organization.name

    else:
        # If projects_list_url is a json file
        if len(data) and type(data[0]) in (str, unicode):
            # Likely that the JSON data is a simple list of strings
            projects = [dict(organization_name=organization.name, code_url=item)
                        for item in data]

        elif len(data) and type(data[0]) is dict:
            # Map data to name, description, link_url, code_url (skip type, categories)
            projects = [dict(name=p['name'], description=p['description'],
                             link_url=p['homepage'], code_url=p['html_url'],
                             organization_name=organization.name)
                        for p in data]

        elif len(data):
            raise Exception('Unknown type for first project: "%s"' % repr(type(data[0])))

        else:
            projects = []

    map(update_project_info, projects)

    return projects
Example #37
def get_projects(organization):
    '''
        Get a list of projects from CSV, TSV, JSON, or Github URL.
        Convert to a dict.
        TODO: Have this work for GDocs.
    '''
    _, host, path, _, _, _ = urlparse(organization.projects_list_url)
    matched = match(r'(/orgs)?/(?P<name>[^/]+)/?$', path)

    if host in ('www.github.com', 'github.com') and matched:
        projects_url = 'https://api.github.com/users/%s/repos' % matched.group('name')
        response = get_github_api(projects_url)
    else:
        projects_url = organization.projects_list_url
        logging.info('Asking for ' + projects_url)
        response = get(projects_url)

    try:
        data = get_adjoined_json_lists(response)

    except ValueError:

        # If projects_list_url is a type of csv
        data = response.content.splitlines()

        try:
            dialect = Sniffer().sniff(response.content)

            #
            # Google Docs CSV output uses double quotes instead of an escape char,
            # but there's not typically a way to know that just from the dialect
            # sniffer. If we see a comma delimiter and no escapechar, then set
            # doublequote to True so that GDocs output doesn't barf.
            #
            # Code for Philly's CSV is confusing the sniffer. I suspect its the
            # fields with quoted empty strings.
            # "OpenPhillyGlobe","\"Google Earth for Philadelphia\" with open source
            # and open transit data." ","http://cesium.agi.com/OpenPhillyGlobe/",
            # "https://github.com/AnalyticalGraphicsInc/OpenPhillyGlobe","",""
            #
            if '\\' in response.content:
                dialect.escapechar = '\\'

            # Check for quoted empty strings vs doublequotes
            if ',""' not in response.content and '""' in response.content:
                dialect.doublequote = True

            projects = list(DictReader(data, dialect=dialect))

        except csv.Error:
            projects = list(DictReader(data))

        # Decode everything to unicode objects.
        for (index, proj) in enumerate(projects):
            projects[index] = dict([(k.decode('utf8'), v.decode('utf8'))
                                         for (k, v) in proj.items()])

        # Add organization names along the way.
        for project in projects:
            project['organization_name'] = organization.name

    else:
        # Fail silently when the github url is not valid
        if type(data) != list and data['message'] == u'Not Found':
            return []

        # If projects_list_url is a json file
        if len(data) and type(data[0]) in (str, unicode):
            # Likely that the JSON data is a simple list of strings
            projects = [dict(organization_name=organization.name, code_url=item)
                        for item in data]

        elif len(data) and type(data[0]) is dict:
            # Map data to name, description, link_url, code_url (skip type, categories)
            # all keys don't always exist
            projects = []
            for project in data:
                new_project = {}
                new_project['organization_name'] = organization.name
                if "name" in project:
                    new_project["name"] = project["name"]
                if "description" in project:
                    new_project["description"] = project["description"]
                if "homepage" in project:
                    new_project["link_url"] = project["homepage"]
                if "html_url" in project:
                    new_project["code_url"] = project["html_url"]
                projects.append(new_project)

        elif len(data):
            raise Exception('Unknown type for first project: "%s"' % repr(type(data[0])))

        else:
            projects = []

    projects = [update_project_info(proj) for proj in projects]

    # Filter out projects that have not been updated
    new_projects = []
    for proj in projects:
        if proj is not None:
            new_projects.append(proj)

    return new_projects