Пример #1
0
 def get_fields(cls, file, encoding):
     """Yield one ArticleField per key found in the scraped units of *file*.

     The file content is obtained through ``_read(file, encoding)`` and
     handed to ``cls._scrape_unit``; every item it produces is treated as a
     mapping. Values are grouped by key, and for each key an ArticleField is
     yielded with the key passed as both the first and the second argument
     and at most the first five collected values as the third.
     """
     samples_by_key = defaultdict(list)
     units = cls._scrape_unit(_read(file, encoding))
     for unit in units:
         for key, sample in unit.items():
             samples_by_key[key].append(sample)
     # Only a short preview (first five values) is exposed per key.
     for key in samples_by_key:
         yield ArticleField(key, key, samples_by_key[key][:5])
Пример #2
0
 def parse_field(file, type, value):
     """Resolve a single field specification to its concrete value.

     ``type == 'literal'`` returns *value* untouched. Otherwise *value*
     names a source: ``'filename'``, ``'text'`` (the content returned by
     ``_read``), or ``'filename-n'`` for the n-th underscore-separated part
     of the filename.

     NOTE(review): ``filename`` and ``encoding`` are not parameters; they are
     presumably provided by the enclosing scope -- confirm against the caller.

     Raises ValueError when *value* matches none of the known sources (or
     when the part number is not an integer).
     """
     if type != 'literal':
         if value == 'text':
             return _read(file, encoding)
         if value == 'filename':
             return filename
         if value.startswith('filename-'):
             # filename-n uses a 1 based index, hence the "- 1" below.
             position = int(value.rsplit("-", 1)[-1])
             pieces = filename.split("_")
             return pieces[position - 1]
         raise ValueError("Can't parse field {value}".format(value=value))
     return value
Пример #3
0
 def get_fields(cls, file: str, encoding: str):
     """Yield the ArticleFields that can be derived from a single file.

     Fields produced, in order:

     * ``"Filename"`` -- the base name of *file* without its extension.
     * ``"Text"`` -- the result of ``_read(file, encoding=encoding, n=100)``
       (presumably a short preview of the content; confirm what ``n`` means
       against ``_read``).
     * ``"Path"`` -- the directory part of *file*, only when it is non-empty.
     * ``"Filename part N"`` -- one field per underscore-separated part of
       the base name, only when the name contains an underscore.

     Parts are numbered starting at 1 so the labels agree with
     ``parse_field``, which resolves ``"Filename part n"`` as
     ``filename.split("_")[n - 1]``. Numbering from 0 made "part 0" resolve
     to the *last* part and shifted every other part by one.
     """
     path, fn = os.path.split(file)
     fn = os.path.splitext(fn)[0]
     yield ArticleField("Filename", "title", values=[fn])
     # FIXME encoding, and probably don't read the whole file?
     yield ArticleField("Text",
                        "text",
                        values=[_read(file, encoding=encoding, n=100)])
     if path:
         yield ArticleField("Path", "section", values=[path])
     if "_" in fn:
         # start=1: the part number is a 1 based index (see docstring).
         for i, elem in enumerate(fn.split("_"), start=1):
             yield ArticleField("Filename part {i}".format(i=i),
                                values=[elem])
Пример #4
0
 def parse_field(file, type, value):
     """Resolve a single field specification to its concrete value.

     ``type == 'literal'`` returns *value* untouched. Otherwise *value* is a
     field label: ``'Filename'``, ``'Text'`` (the content returned by
     ``_read``), ``'Path'``, or ``'Filename part n'`` for the n-th
     underscore-separated part of the filename.

     NOTE(review): ``filename``, ``path`` and ``encoding`` are not
     parameters; they are presumably provided by the enclosing scope --
     confirm against the caller.

     Raises ValueError when *value* matches none of the known labels (or
     when the part number is not an integer).
     """
     if type != 'literal':
         if value == 'Text':
             return _read(file, encoding)
         if value == 'Filename':
             return filename
         if value == 'Path':
             return path
         if value.startswith('Filename part '):
             # The part number is a 1 based index, hence the "- 1" below.
             position = int(value.replace("Filename part ", ""))
             pieces = filename.split("_")
             return pieces[position - 1]
         raise ValueError("Can't parse field {value}".format(value=value))
     return value
Пример #5
0
def split_file(file, encoding):
    """Return every element with ``class="article"`` found in *file*.

    The content obtained from ``_read(file, encoding)`` is parsed with
    ``html.fromstring`` (presumably lxml's HTML parser -- confirm against
    the file's imports), and the resulting tree is queried with the CSS
    selector ``.article``.
    """
    markup = _read(file, encoding)
    # Turn the raw markup into an element tree that supports CSS queries.
    tree = html.fromstring(markup)
    articles = tree.cssselect(".article")
    return articles
Пример #6
0
 def _preprocess(cls, file, encoding):
     """Read *file* and return ``(query, articles)``.

     The content from ``_read(file, encoding)`` is passed to ``split_file``,
     which returns the query and an iterable of fragments. Each fragment is
     run through ``parse_article``; falsy results are dropped, and the
     remaining ones are returned as a list alongside the query.
     """
     query, fragments = split_file(_read(file, encoding))
     articles = []
     for fragment in fragments:
         parsed = parse_article(fragment)
         if not parsed:
             # parse_article produced nothing usable for this fragment.
             continue
         articles.append(parsed)
     return query, articles
Пример #7
0
 def parse_file(self, file, encoding, _data):
     """Yield an Article for every unit scraped from *file*.

     The content from ``_read(file, encoding)`` is handed to
     ``self._scrape_unit``; each item it produces is passed through
     ``self.map_article`` and then ``Article.fromdict``. ``_data`` is not
     used here.
     """
     units = self._scrape_unit(_read(file, encoding))
     for unit in units:
         mapped = self.map_article(unit)
         yield Article.fromdict(mapped)