Пример #1
0
 def from_file(self, path):
     """Remember ``path`` and load the words stored in that file.

     ``self.file_path`` is always set to ``path``. When ``path`` is falsy,
     ``self.word_list`` becomes an empty list. Otherwise the file is opened
     with the encoding returned by ``detect_encoding(path)`` and
     ``self.word_list`` becomes the set of its lines, each stripped of
     surrounding whitespace.

     Args:
         path: location of the word file, or a falsy value for "no file"
     """
     self.file_path = path
     if path:
         with open(path, encoding=detect_encoding(path)) as source:
             self.word_list = {entry.strip() for entry in source}
     else:
         self.word_list = []
Пример #2
0
    def from_file(path):
        """Read a word file and return its lines as a set.

        Each line is stripped of surrounding whitespace before being added
        to the set. Decoding is attempted with UTF-8 first, then with the
        default encoding used by ``open`` (``encoding=None``), and finally
        with whatever ``detect_encoding(path)`` returns. Detection is only
        run if the first two attempts fail.

        Args:
            path: location of the file; a falsy value yields an empty set

        Returns:
            set of stripped lines

        Raises:
            UnicodeError: if the file cannot be decoded with any candidate
        """
        if not path:
            return set()

        def candidate_encodings():
            # A generator keeps detect_encoding() from running unless the
            # cheaper guesses have already failed.
            yield 'utf-8'
            yield None  # let open() choose its default (locale) encoding
            yield detect_encoding(path)

        for encoding in candidate_encodings():
            try:
                with open(path, encoding=encoding) as f:
                    return {line.strip() for line in f}
            except UnicodeDecodeError:
                continue
        # No encoding worked, raise
        raise UnicodeError("Couldn't determine file encoding")
Пример #3
0
    def from_file(self, path):
        """Remember ``path`` and load the word list from that file.

        ``self.file_path`` is set to ``path`` and ``self.word_list`` is reset
        to an empty list. If ``path`` is falsy nothing else happens.
        Otherwise ``self.word_list`` becomes the set of the file's lines,
        each stripped of surrounding whitespace.

        Decoding is attempted with UTF-8 first, then with the default
        encoding used by ``open`` (``encoding=None``), and finally with
        whatever ``detect_encoding(path)`` returns. Detection is only run if
        the first two attempts fail.

        Args:
            path: location of the word file, or a falsy value for "no file"

        Raises:
            UnicodeError: if the file cannot be decoded with any candidate
        """
        self.file_path = path
        self.word_list = []
        if not path:
            return

        def candidate_encodings():
            # A generator keeps detect_encoding() from running unless the
            # cheaper guesses have already failed.
            yield 'utf-8'
            yield None  # let open() choose its default (locale) encoding
            yield detect_encoding(path)

        for encoding in candidate_encodings():
            try:
                with open(path, encoding=encoding) as f:
                    self.word_list = {line.strip() for line in f}
            except UnicodeDecodeError:
                continue
            return
        # No encoding worked, raise
        raise UnicodeError("Couldn't determine file encoding")
Пример #4
0
    def from_file(self, path):
        """Store ``path`` and populate the word list from the file it names.

        ``self.file_path`` is set to ``path`` and ``self.word_list`` is reset
        to an empty list. For a falsy ``path`` the method returns right
        away. Otherwise ``self.word_list`` is replaced by the set of the
        file's lines with surrounding whitespace removed.

        The file is decoded with UTF-8 if possible, then with the default
        encoding of ``open`` (``encoding=None``), and only as a last resort
        with the result of ``detect_encoding(path)``.

        Args:
            path: location of the word file, or a falsy value for "no file"

        Raises:
            UnicodeError: if none of the candidate encodings can decode
                the file
        """
        self.file_path = path
        self.word_list = []
        if not path:
            return

        def candidate_encodings():
            # Lazily produced, so detect_encoding() is called only after
            # both cheaper guesses raised UnicodeDecodeError.
            yield 'utf-8'
            yield None  # let open() choose its default (locale) encoding
            yield detect_encoding(path)

        for encoding in candidate_encodings():
            try:
                with open(path, encoding=encoding) as f:
                    self.word_list = {line.strip() for line in f}
            except UnicodeDecodeError:
                continue
            return
        # No encoding worked, raise
        raise UnicodeError("Couldn't determine file encoding")
Пример #5
0
    def from_file(cls, filename):
        """
        Load distance matrix from a file

        The file should preferably be encoded in ascii/utf-8. White space at
        the beginning and end of lines is ignored.

        The first line of the file starts with the matrix dimension. It
        can be followed by a list of flags

        - *axis=<number>*: the axis number
        - *symmetric*: the matrix is symmetric; when reading the element (i, j)
          its value is also assigned to (j, i)
        - *asymmetric*: the matrix is asymmetric
        - *row_labels*: the file contains row labels
        - *col_labels*: the file contains column labels

        By default, matrices are symmetric, have axis 1 and no labels are given.
        Flags *labeled* and *labelled* are obsolete aliases for *row_labels*.

        If the file has column labels, they follow in the second line.
        Row labels appear at the beginning of each row.
        Labels are arbitrary strings that cannot contain newlines and
        tabulators. Labels are stored as instances of `Table` with a single
        meta attribute named "label".

        The remaining lines contain tab-separated numbers, preceded with labels,
        if present. Lines are padded with zeros if necessary. If the matrix is
        symmetric, the file contains the lower triangle; any data above the
        diagonal is ignored.

        Args:
            filename: file name

        Returns:
            an instance of `cls` built from the matrix, the row labels, the
            column labels and the axis

        Raises:
            ValueError: if the file is empty, does not begin with the
                dimension, contains an invalid flag, has a mismatching
                number of column labels, has too many rows or columns, or
                contains an element that is not a number
        """
        with open(filename, encoding=detect_encoding(filename)) as fle:
            line = fle.readline()
            if not line:
                raise ValueError("empty file")
            data = line.strip().split()
            # A whitespace-only first line yields no tokens at all; report it
            # as a missing dimension instead of failing with IndexError.
            if not data or not data[0].isdigit():
                raise ValueError("distance file must begin with dimension")
            n = int(data.pop(0))
            symmetric = True
            axis = 1
            # None means "no labels"; a list means labels are expected
            col_labels = row_labels = None
            for flag in data:
                if flag in ("labelled", "labeled", "row_labels"):
                    row_labels = []
                elif flag == "col_labels":
                    col_labels = []
                elif flag == "symmetric":
                    symmetric = True
                elif flag == "asymmetric":
                    symmetric = False
                else:
                    flag_data = flag.split("=")
                    if len(flag_data) == 2:
                        name, value = map(str.strip, flag_data)
                    else:
                        name, value = "", None
                    if name == "axis" and value.isdigit():
                        axis = int(value)
                    else:
                        raise ValueError("invalid flag '{}'".format(flag))
            if col_labels is not None:
                col_labels = [x.strip()
                              for x in fle.readline().strip().split("\t")]
                if len(col_labels) != n:
                    raise ValueError("mismatching number of column labels")

            matrix = np.zeros((n, n))
            for i, line in enumerate(fle):
                if i >= n:
                    raise ValueError("too many rows")
                line = line.strip().split("\t")
                if row_labels is not None:
                    row_labels.append(line.pop(0).strip())
                if len(line) > n:
                    raise ValueError("too many columns in matrix row {}".
                                     format("'{}'".format(row_labels[i])
                                            if row_labels else i + 1))
                # For symmetric matrices only the lower triangle (including
                # the diagonal) is read; the rest of the line is ignored.
                for j, e in enumerate(line[:i + 1 if symmetric else n]):
                    try:
                        matrix[i, j] = float(e)
                    except ValueError as exc:
                        raise ValueError(
                            "invalid element at row {}, column {}".format(
                                "'{}'".format(row_labels[i])
                                if row_labels else i + 1,
                                "'{}'".format(col_labels[j])
                                if col_labels else j + 1)) from exc
                    if symmetric:
                        matrix[j, i] = matrix[i, j]
        if col_labels:
            col_labels = Table.from_list(
                Domain([], metas=[StringVariable("label")]),
                [[item] for item in col_labels])
        if row_labels:
            row_labels = Table.from_list(
                Domain([], metas=[StringVariable("label")]),
                [[item] for item in row_labels])
        return cls(matrix, row_labels, col_labels, axis)
Пример #6
0
 def read_file(self):
     """Read the whole file at ``self.path`` into ``self.content``.

     The file is opened in text mode with the encoding returned by
     ``detect_encoding(self.path)``.
     """
     with open(self.path, 'r',
               encoding=detect_encoding(self.path)) as source:
         text = source.read()
     self.content = text
Пример #7
0
 def read_file(self):
     """Store the text of the markup file at ``self.path`` in ``self.content``.

     The file is opened with the encoding returned by
     ``detect_encoding(self.path)``; characters that cannot be decoded are
     dropped (``errors='ignore'``). The markup is parsed by BeautifulSoup
     with the "lxml" parser and the result of ``get_text()`` is kept.
     """
     source_encoding = detect_encoding(self.path)
     with open(self.path, encoding=source_encoding,
               errors='ignore') as handle:
         raw_markup = handle.read()
     self.content = BeautifulSoup(raw_markup, "lxml").get_text()
Пример #8
0
 def read_file(self):
     """Load the full text of the file at ``self.path`` into ``self.content``.

     The encoding passed to ``open`` is whatever
     ``detect_encoding(self.path)`` returns.
     """
     detected = detect_encoding(self.path)
     with open(self.path, 'r', encoding=detected) as handle:
         file_text = handle.read()
     self.content = file_text
Пример #9
0
 def read_file(self):
     """Extract plain text from the markup file at ``self.path``.

     The file is read using the encoding returned by
     ``detect_encoding(self.path)``, ignoring undecodable characters. It is
     then parsed by BeautifulSoup with the "lxml" parser, and the output of
     ``get_text()`` is assigned to ``self.content``.
     """
     with open(self.path, encoding=detect_encoding(self.path),
               errors='ignore') as handle:
         document = handle.read()
     parsed = BeautifulSoup(document, "lxml")
     self.content = parsed.get_text()