def get_fields(cls, file, encoding):
    """Yield one ArticleField per key found in the scraped units of *file*.

    The file is read with ``_read(file, encoding)`` and passed to
    ``cls._scrape_unit``. Every dict that comes back contributes its
    values, grouped per key. For each key an ``ArticleField`` is yielded
    with the key passed as both the first and second argument and at most
    the first five collected values as the third.
    """
    content = _read(file, encoding)
    samples = defaultdict(list)
    for record in cls._scrape_unit(content):
        for name, item in record.items():
            samples[name].append(item)
    for name in samples:
        # Only a handful of example values is handed out per field.
        yield ArticleField(name, name, samples[name][:5])
def parse_field(file, type, value):
    """Resolve a field specification to the value it stands for.

    ``filename`` and ``encoding`` are not parameters; they are taken from
    the enclosing scope.

    * ``type == 'literal'``: *value* is returned unchanged.
    * ``value == 'filename'``: the enclosing ``filename`` is returned.
    * ``value == 'text'``: the result of ``_read(file, encoding)``.
    * ``value`` starting with ``'filename-'``: the number after the last
      dash selects one of the underscore-separated parts of ``filename``.

    Raises:
        ValueError: if *value* matches none of the forms above, or if the
            part number is not an integer.
    """
    if type == 'literal':
        return value
    if value == 'filename':
        return filename
    if value == 'text':
        return _read(file, encoding)
    if not value.startswith('filename-'):
        raise ValueError("Can't parse field {value}".format(value=value))
    position = int(value.rsplit("-", 1)[-1])
    parts = filename.split("_")
    # filename-n is 1-based. NOTE: n == 0 therefore becomes index -1 and
    # selects the last part.
    return parts[position - 1]
def get_fields(cls, file: str, encoding: str):
    """Yield the ArticleFields that can be derived from a plain file.

    Fields are yielded in this order:

    * ``"Filename"``: the base name of *file* without its extension.
    * ``"Text"``: the result of ``_read(file, encoding=encoding, n=100)``.
    * ``"Path"``: the directory part of *file*, only when it is non-empty.
    * ``"Filename part <i>"``: one field per underscore-separated part of
      the base name, only when the name contains an underscore.

    Parts are numbered from 1. ``parse_field`` resolves
    ``"Filename part n"`` with ``filename.split("_")[n - 1]``, so a
    0-based label would point at the wrong part ("part 0" would resolve
    to the last one).

    Args:
        file: path of the file to describe.
        encoding: encoding passed through to ``_read``.
    """
    path, fn = os.path.split(file)
    fn, _ext = os.path.splitext(fn)
    yield ArticleField("Filename", "title", values=[fn])
    # FIXME encoding, and probably don't read the whole file?
    yield ArticleField("Text", "text", values=[_read(file, encoding=encoding, n=100)])
    if path:
        yield ArticleField("Path", "section", values=[path])
    if "_" in fn:
        # start=1 keeps the labels in step with parse_field's 1-based lookup.
        for i, elem in enumerate(fn.split("_"), start=1):
            yield ArticleField("Filename part {i}".format(i=i), values=[elem])
def parse_field(file, type, value):
    """Resolve a field specification to the value it stands for.

    ``filename``, ``path`` and ``encoding`` are not parameters; they are
    taken from the enclosing scope.

    * ``type == 'literal'``: *value* is returned unchanged.
    * ``value == 'Filename'``: the enclosing ``filename`` is returned.
    * ``value == 'Text'``: the result of ``_read(file, encoding)``.
    * ``value == 'Path'``: the enclosing ``path`` is returned.
    * ``value`` starting with ``'Filename part '``: the remaining number
      selects one of the underscore-separated parts of ``filename``.

    Raises:
        ValueError: if *value* matches none of the forms above, or if the
            part number is not an integer.
    """
    if type == 'literal':
        return value
    if value == 'Filename':
        return filename
    if value == 'Text':
        return _read(file, encoding)
    if value == 'Path':
        return path
    if not value.startswith('Filename part '):
        raise ValueError("Can't parse field {value}".format(value=value))
    position = int(value.replace("Filename part ", ""))
    parts = filename.split("_")
    # The part number is 1-based, hence the shift by one.
    return parts[position - 1]
def split_file(file, encoding):
    """Return the elements of an HTML file that have the ``article`` class.

    The content obtained from ``_read(file, encoding)`` is parsed with
    ``html.fromstring`` (presumably ``lxml.html`` -- confirm against the
    module imports). The result of the CSS selection ``.article`` on the
    parsed document is returned.
    """
    markup = _read(file, encoding)
    # Parse the raw HTML into a tree that is more convenient to query.
    tree = html.fromstring(markup)
    # Select every element with class="article".
    return tree.cssselect(".article")
def _preprocess(cls, file, encoding):
    """Read *file* and return ``(query, articles)``.

    The text from ``_read(file, encoding)`` is passed to ``split_file``,
    which gives back a query and an iterable of fragments. Each fragment
    is run through ``parse_article``; falsy results are dropped.

    Returns:
        A 2-tuple of the query and the list of truthy parsed articles, in
        fragment order.
    """
    content = _read(file, encoding)
    query, fragments = split_file(content)
    articles = []
    for fragment in fragments:
        parsed = parse_article(fragment)
        if parsed:
            articles.append(parsed)
    return query, articles
def parse_file(self, file, encoding, _data):
    """Yield an Article for every unit scraped from *file*.

    The file is read with ``_read(file, encoding)`` and handed to
    ``self._scrape_unit``. Each resulting dict is passed through
    ``self.map_article`` and then turned into an article with
    ``Article.fromdict``. ``_data`` is accepted but not used here.
    """
    content = _read(file, encoding)
    for scraped in self._scrape_unit(content):
        mapped = self.map_article(scraped)
        yield Article.fromdict(mapped)