def get_fields(cls, file, encoding):
    """Yield an ArticleField suggestion for every populated CSV column.

    At most five rows are sampled from each file produced by
    ``UploadScript._get_files(file, encoding)``. For every column the
    distinct, stripped, non-empty values are collected; the first of them
    is passed to ``guess_destination_and_type`` together with the
    sanitised column name to obtain a suggested destination and type.

    Columns without any non-empty sample value are not yielded. Fields are
    yielded in sorted order of the original column name, each carrying at
    most five sample values.
    """
    sample_data = defaultdict(OrderedSet)

    # NOTE(review): the per-file encoding yielded by _get_files is ignored and
    # the caller-supplied `encoding` is used instead -- confirm this is intended.
    for f, _enc, _ in UploadScript._get_files(file, encoding):
        reader = csv.DictReader(_open(f, encoding))
        for row in itertools.islice(reader, 5):
            for field_name, value in row.items():
                # Ragged rows: DictReader fills missing cells with None and
                # stores surplus cells as a list under the key None. Neither
                # is a usable sample, and calling .strip() on them would raise.
                if not isinstance(value, str):
                    continue
                stripped = value.strip()
                if stripped:
                    sample_data[field_name].add(stripped)

    # Only non-empty strings were stored above, so no separate clean-up pass
    # for empty values is needed.
    for field_name, values in sorted(sample_data.items(), key=itemgetter(0)):
        filtered_field_name = to_valid_field_name(field_name)
        value = next(iter(values), None)
        suggested_destination, suggested_type = guess_destination_and_type(
            filtered_field_name, value)
        yield ArticleField(
            field_name,
            destination=suggested_destination,
            values=list(itertools.islice(values, 5)),
            suggested_type=suggested_type,
        )
def parse_file(self, file, encoding, _data):
    """Yield one article per data row of the CSV file.

    Every row is translated with ``self.map_article``; each value of the
    mapped row is then converted by ``parse_value`` (called with the key and
    the raw value) and the result is turned into an article through
    ``Article.fromdict``. The ``_data`` argument is not used.
    """
    csv_rows = csv.DictReader(_open(file, encoding))
    for raw_row in csv_rows:
        mapped = self.map_article(raw_row)
        yield Article.fromdict({
            name: parse_value(name, raw) for name, raw in mapped.items()
        })
def get_fields(cls, file: str, encoding: str):
    """Yield an ArticleField for every column of the semicolon-separated files.

    Each file produced by ``cls._get_files(file, encoding)`` is read in full.
    A column header is looked up in the reverse language map built from
    ``cls.languages`` to find its destination key, which is in turn resolved
    through ``ESFIELDS``. All values of the column are attached to the field.

    When several files share a column name, the values of the last file win.
    A header missing from the reverse map raises ``KeyError``.
    """
    fields = OrderedDict()
    field_map = Language.reverseMap(cls.languages)

    for path, file_encoding, _ in cls._get_files(file, encoding):
        reader = csv.DictReader(_open(path, file_encoding), delimiter=";")
        rows = list(reader)
        # fieldnames is None when the file has no header row at all (empty
        # file); treat that as "no columns" instead of failing on iteration.
        for column in reader.fieldnames or ():
            fields[column] = (field_map[column], [row[column] for row in rows])

    for source, (destination, values) in fields.items():
        yield ArticleField(source, destination=ESFIELDS[destination], values=values)
def parse_file(self, file: str, encoding: str, _: None):
    """Yield the result of ``self._scrape_unit`` for each row of the file.

    Resets ``self.queries`` to an empty set, opens the file as a
    semicolon-separated CSV, and stores the result of
    ``self._get_language`` (which receives the reader itself) in
    ``self.lang`` before any row is scraped.
    """
    self.queries = set()
    reader = csv.DictReader(_open(file, encoding), delimiter=";")
    # NOTE(review): _get_language is handed the live reader; whether it
    # consumes rows from it cannot be seen from here.
    self.lang = self._get_language(reader)
    for record in reader:
        yield self._scrape_unit(record)
def setUp(self):
    """Locate the two DeFacto sample files and load their parsed HTML.

    Paths are resolved relative to this module's directory, under
    ``test_files/defacto``. Each file is opened with encoding
    ``"autodetect"`` and passed to ``get_html``.
    """
    module_dir = os.path.dirname(__file__)
    fixture_dir = os.path.join(module_dir, 'test_files', 'defacto')
    self.test_dir = fixture_dir

    first_path = os.path.join(fixture_dir, 'DeFacto-Campus - Ausdruck1.htm')
    self.test1 = first_path
    self.test1_html = get_html(_open(first_path, "autodetect"))

    second_path = os.path.join(fixture_dir, 'DeFacto-Campus - Ausdruck2.htm')
    self.test2 = second_path
    self.test2_html = get_html(_open(second_path, "autodetect"))