Пример #1
0
class DownloadLinkFetcher:
    """Step through a date range and fetch the download links for each step.

    ``_html_to_links`` and ``_next_api`` are placeholders in this class (they
    return an empty list and an empty string); presumably site-specific
    subclasses override them -- confirm against the rest of the project.
    """

    # Number of extra attempts made after the first failed download.
    RETRY = 5

    def __init__(self, config):
        """Copy the API base URL and the date-range settings from *config*.

        Reads ``base_api_url``, ``start_date``, ``end_date``, ``step_unit``
        and ``step`` from *config* and creates a ``NetworkFetcher``.
        """
        self.base_api_url = config.base_api_url

        self.start_date = config.start_date
        # Cursor advanced by ``next()``; starts at the beginning of the range.
        self.current_date = config.start_date
        self.end_date = config.end_date
        self.step_unit = config.step_unit
        self.step = config.step

        self.html_fetcher = NetworkFetcher()

    def _format_link(self, link):
        """Return *link* without its ``#fragment`` and one trailing slash.

        An empty result (for example when *link* consists only of a
        fragment) is returned as ``''`` instead of raising ``IndexError``.
        """
        link = link.split('#', 1)[0]
        # ``endswith`` is safe on an empty string, unlike ``link[-1]``.
        if link.endswith('/'):
            link = link[:-1]
        return link

    def _link_filter(self, link, filters):
        """Return ``True`` when *link* should be kept.

        A link is rejected when it is empty, when its last character is not
        a digit, or when any filter matches.  Each filter is an indexable
        ``(text, start, stop)`` triple that matches when
        ``link[start:stop] == text``.
        """
        # Slicing instead of indexing so that an empty link is rejected
        # rather than raising ``IndexError``.
        if not link[-1:].isdigit():
            return False
        return all(link[rule[1]:rule[2]] != rule[0] for rule in filters)

    def _html_to_links(self, html):
        """Extract links from *html*; this implementation returns ``[]``."""
        return []

    def _next_api(self, base_url, current_date):
        """Build the API URL for *current_date*; this one returns ``''``."""
        return ''

    def next(self):
        """Return ``(api_url, date)`` for the current step and advance.

        Returns ``(None, None)`` once the cursor has reached ``end_date``
        (the end date itself is excluded).
        """
        if self.current_date >= self.end_date:
            return None, None
        date = self.current_date
        api_url = self._next_api(self.base_api_url, date)
        self.current_date += self.step
        return api_url, date

    def fetch(self, api_url):
        """Download *api_url* and return the links extracted from it.

        The download is attempted once plus up to ``RETRY`` more times while
        the fetcher keeps returning ``None``.  An empty list is returned
        when every attempt fails or the response is empty.
        """
        print('fetching download links...')
        html = None
        # One initial attempt plus RETRY retries, as before.
        for _ in range(self.RETRY + 1):
            html = self.html_fetcher.fetch(api_url)
            if html is not None:
                break
        if html is None or len(html) == 0:
            print('api', api_url, ' failed')
            return []
        return self._html_to_links(html)
Пример #2
0
    def __init__(self, config):
        """Keep *config*, create the network fetcher and call ``_mkdir``.

        ``download_link_fetcher`` is left as ``None`` here.  ``_mkdir`` is
        defined outside this excerpt; it is called with the output path and
        the start date, end date and step taken from *config*.
        """
        # Must be initialised before ``_mkdir`` runs.
        self.total_date = 0

        self.path = config.path
        self.config = config
        self.html_fetcher = NetworkFetcher()
        self.download_link_fetcher = None

        self._mkdir(
            self.path,
            config.start_date,
            config.end_date,
            config.step,
        )
Пример #3
0
    def __init__(self, config):
        """Copy the API base URL and the date-range settings from *config*.

        ``current_date`` starts out equal to ``start_date``.
        """
        self.html_fetcher = NetworkFetcher()
        self.base_api_url = config.base_api_url

        # The date window; the cursor begins at its lower bound.
        self.start_date = self.current_date = config.start_date
        self.end_date = config.end_date

        # Step settings copied verbatim from the configuration.
        self.step = config.step
        self.step_unit = config.step_unit
Пример #4
0
    def __init__(self, config):
        """Set up the fetcher and widen the configured range to whole months.

        Two attributes are written back onto *config*: ``start_date_`` is
        ``start_date`` moved to day 1 of its month, and ``end_date_`` is
        ``end_date`` moved to day 1 of the following month unless it is
        already on day 1.  ``_mkdir`` (defined outside this excerpt) is then
        called with the widened range, and a ``NytimesLinkFetcher`` is
        created from *config*.
        """
        self.total_date = 0
        self.config = config
        self.path = config.path
        self.html_fetcher = NetworkFetcher()
        self.download_link_fetcher = None

        config.start_date_ = config.start_date.replace(day=1)

        month_end = config.end_date
        if month_end.day > 1:
            # Part-way through a month: move to the first day of the next one.
            month_end = month_end.replace(day=1) + relativedelta(months=1)
        config.end_date_ = month_end

        self._mkdir(
            self.path,
            config.start_date_,
            config.end_date_,
            config.step,
        )

        self.download_link_fetcher = NytimesLinkFetcher(config)
Пример #5
0
class ArticleFetcher:
    """Download the articles for every date step and store them as XML.

    ``download_link_fetcher`` is ``None`` in this class and
    ``_html_to_infomation`` returns an empty dict; presumably subclasses
    provide both -- confirm against the rest of the project.
    """

    # Number of extra attempts made after the first failed download.
    RETRY = 5

    # (XML tag, article key) pairs, in the order they are written per DOC.
    _XML_FIELDS = (
        ('ID', 'ID'),
        ('TITLE', 'title'),
        ('AUTHER', 'authors'),
        ('DATE', 'published_date'),
        ('TOPIC', 'section'),
        ('IMAGE', 'image'),
        ('TEXT', 'content'),
        ('URL', 'link'),
    )

    def __init__(self, config):
        """Keep *config*, create the network fetcher and the output tree.

        Reads ``path``, ``start_date``, ``end_date`` and ``step`` from
        *config*.
        """
        self.config = config
        self.download_link_fetcher = None
        self.html_fetcher = NetworkFetcher()
        self.path = config.path

        # Number of date steps in the range; counted by ``_mkdir``.
        self.total_date = 0

        self._mkdir(self.path, config.start_date, config.end_date, config.step)

    def _mkdir(self, path, start_date, end_date, step):
        """Create the ``year[/month[/day]]`` tree under *path* for the range.

        Walks from *start_date* (inclusive) to *end_date* (exclusive) in
        increments of *step* and adds one to ``total_date`` per step.  Month
        directories are created when the step has a positive ``months`` or
        ``days`` component, day directories only when ``days`` is positive.
        Existing directories are left untouched.

        *step* may be any object that can be added to the dates and has a
        ``days`` attribute; a missing ``months`` attribute (as on
        ``datetime.timedelta``) is treated as zero.
        """
        os.makedirs(path, exist_ok=True)

        by_day = step.days > 0
        by_month = by_day or getattr(step, 'months', 0) > 0

        current_date = start_date
        while current_date < end_date:
            parts = [str(current_date.year)]
            if by_month:
                parts.append(str(current_date.month))
            if by_day:
                parts.append(str(current_date.day))
            # Creates any missing parents and tolerates existing directories.
            os.makedirs(os.path.join(path, *parts), exist_ok=True)

            current_date += step
            self.total_date += 1

    def _html_to_infomation(self, html, link, date):
        """Turn article *html* into a dict; this implementation returns ``{}``."""
        return {}

    def _extract_information(self, link, date):
        """Download *link* and return the parsed article, or ``None``.

        The download is attempted once plus up to ``RETRY`` more times while
        the fetcher keeps returning ``None``.
        """
        html = None
        for _ in range(self.RETRY + 1):
            html = self.html_fetcher.fetch(link)
            if html is not None:
                break
        if html is None:
            print('article ', link, 'failed')
            return None
        return self._html_to_infomation(html, link, date)

    def _get_storage_path(self, path, date):
        """Return ``path/year/month/day`` for *date*."""
        return os.path.join(path, str(date.year), str(date.month),
                            str(date.day))

    def _lazy_storage(self, storage_path, links, date):
        """Fetch every link and write the results to ``Articles.xml``.

        Each successfully fetched article becomes one ``DOC`` element under
        a ``DATA`` root.  The article's ``ID`` is rewritten in place to
        ``<ID><date>-<n>``, where ``n`` counts stored articles from 1.
        *storage_path* is created when it does not exist yet, so writing
        also works for steps for which ``_mkdir`` made no day directory.
        """
        total_links = len(links)
        next_id = 1
        data = ET.Element('DATA')

        for position, link in enumerate(links, start=1):
            print('>>> {c} in {t} articles\r'.format(c=position,
                                                     t=total_links),
                  end='')

            article = self._extract_information(link, date)
            if article is None:
                continue

            article['ID'] = str(
                article['ID']) + str(date) + '-' + str(next_id)

            doc = ET.SubElement(data, 'DOC')
            for tag, key in self._XML_FIELDS:
                ET.SubElement(doc, tag).text = str(article[key])
            next_id += 1

        pretty_xml = minidom.parseString(
            ET.tostring(data)).toprettyxml(indent="   ")

        os.makedirs(storage_path, exist_ok=True)
        articles_path = os.path.join(storage_path, 'Articles.xml')
        with open(articles_path, mode='w', encoding='utf-8') as articles_file:
            articles_file.write(pretty_xml)

    def fetch(self, lazy_storage=True):
        """Download and store the articles of every date step.

        Pulls ``(api_url, date)`` pairs from ``download_link_fetcher`` until
        it returns ``None`` as URL, storing each date's articles and then
        sleeping for ``config.sleep`` seconds.

        *lazy_storage* is accepted but not used by this class; articles are
        always written through ``_lazy_storage``.
        """
        current_date = 1
        while True:
            api_url, date = self.download_link_fetcher.next()
            if api_url is None:
                break
            print(
                date.strftime('%Y-%m-%d'),
                '{c} in {t} dates                  '.format(c=current_date,
                                                            t=self.total_date))

            storage_path = self._get_storage_path(self.path, date)
            links = self.download_link_fetcher.fetch(api_url)
            self._lazy_storage(storage_path, links, date)

            time.sleep(self.config.sleep)

            print(date.strftime('%Y-%m-%d'),
                  'date {c} finished                 '.format(c=current_date))
            current_date += 1
Пример #6
0
class ArticleFetcher:
    """Download the articles for every date step and store them as JSON.

    ``download_link_fetcher`` is ``None`` in this class and
    ``_html_to_infomation`` returns an empty dict; presumably subclasses
    provide both -- confirm against the rest of the project.
    """

    # Number of extra attempts made after the first failed download.
    RETRY = 5

    def __init__(self, config):
        """Keep *config*, create the network fetcher and the output tree.

        Reads ``path``, ``start_date``, ``end_date`` and ``step`` from
        *config*.
        """
        self.config = config
        self.download_link_fetcher = None
        self.html_fetcher = NetworkFetcher()
        self.path = config.path

        # Number of date steps in the range; counted by ``_mkdir``.
        self.total_date = 0

        self._mkdir(self.path,
                    config.start_date,
                    config.end_date,
                    config.step)

    def _mkdir(self, path, start_date, end_date, step):
        """Create the ``year[/month[/day]]`` tree under *path* for the range.

        Walks from *start_date* (inclusive) to *end_date* (exclusive) in
        increments of *step* and adds one to ``total_date`` per step.  Month
        directories are created when the step has a positive ``months`` or
        ``days`` component, day directories only when ``days`` is positive.
        Existing directories are left untouched.

        *step* may be any object that can be added to the dates and has a
        ``days`` attribute; a missing ``months`` attribute (as on
        ``datetime.timedelta``) is treated as zero.
        """
        os.makedirs(path, exist_ok=True)

        by_day = step.days > 0
        by_month = by_day or getattr(step, 'months', 0) > 0

        current_date = start_date
        while current_date < end_date:
            parts = [str(current_date.year)]
            if by_month:
                parts.append(str(current_date.month))
            if by_day:
                parts.append(str(current_date.day))
            # Creates any missing parents and tolerates existing directories.
            os.makedirs(os.path.join(path, *parts), exist_ok=True)

            current_date += step
            self.total_date += 1

    def _html_to_infomation(self, html, link, date):
        """Turn article *html* into a dict; this implementation returns ``{}``."""
        return {}

    def _extract_information(self, link, date):
        """Download *link* and return the parsed article, or ``None``.

        The download is attempted once plus up to ``RETRY`` more times while
        the fetcher keeps returning ``None``.
        """
        html = None
        for _ in range(self.RETRY + 1):
            html = self.html_fetcher.fetch(link)
            if html is not None:
                break
        if html is None:
            print('article ', link, 'failed')
            return None
        return self._html_to_infomation(html, link, date)

    def _get_storage_path(self, path, date):
        """Return ``path/year/month/day`` for *date*."""
        return os.path.join(path, str(date.year), str(date.month), str(date.day))

    def _lazy_storage(self, storage_path, links, date):
        """Fetch every link, then write one ``articles`` and one ``titles`` file.

        ``articles`` is a JSON object holding the number of links, the
        number of articles actually fetched and the articles themselves;
        ``titles`` holds one article title per line.  Nothing is written
        until all links have been processed.  *storage_path* is created when
        it does not exist yet.
        """
        total_links = len(links)
        articles = []
        titles = []
        for position, link in enumerate(links, start=1):
            print('>>> {c} in {t} articles\r'.format(c=position, t=total_links), end='')

            article = self._extract_information(link, date)
            if article is not None:
                titles.append(article['title'] + '\n')
                articles.append(article)

        os.makedirs(storage_path, exist_ok=True)

        articles_path = os.path.join(storage_path, 'articles')
        with open(articles_path, mode='w', encoding='utf-8') as articles_file:
            json.dump({
                'expected_number': total_links,
                'number': len(articles),
                'articles': articles
            }, articles_file, indent=4)

        titles_path = os.path.join(storage_path, 'titles')
        with open(titles_path, mode='w', encoding='utf-8') as titles_file:
            titles_file.writelines(titles)

    def _non_lazy_storage(self, storage_path, links, date):
        """Fetch every link and write each article as soon as it arrives.

        Every fetched article goes into its own JSON file named after the
        zero-based position of its link in *links*, and its title is
        appended to ``titles``.  *storage_path* is created when it does not
        exist yet.
        """
        total_links = len(links)
        os.makedirs(storage_path, exist_ok=True)

        titles_path = os.path.join(storage_path, 'titles')
        with open(titles_path, mode='w', encoding='utf-8') as titles_file:
            for article_index, link in enumerate(links):
                print('{c} in {t} articles\r'.format(c=article_index + 1, t=total_links), end='')

                article = self._extract_information(link, date)
                if article is None:
                    continue

                titles_file.write(article['title'] + '\n')

                article_path = os.path.join(storage_path, str(article_index))
                with open(article_path, mode='w', encoding='utf-8') as article_file:
                    json.dump(article, article_file, indent=4)

    def fetch(self, lazy_storage=True):
        """Download and store the articles of every date step.

        Pulls ``(api_url, date)`` pairs from ``download_link_fetcher`` until
        it returns ``None`` as URL, storing each date's articles and then
        sleeping for ``config.sleep`` seconds.

        With *lazy_storage* true the articles of a date are written through
        ``_lazy_storage``, otherwise through ``_non_lazy_storage``.
        """
        store = self._lazy_storage if lazy_storage else self._non_lazy_storage

        current_date = 1
        while True:
            api_url, date = self.download_link_fetcher.next()
            if api_url is None:
                break
            print(date.strftime('%Y-%m-%d'),
                  '{c} in {t} dates                  '.format(c=current_date, t=self.total_date))

            storage_path = self._get_storage_path(self.path, date)
            links = self.download_link_fetcher.fetch(api_url)
            store(storage_path, links, date)

            time.sleep(self.config.sleep)

            print(date.strftime('%Y-%m-%d'),
                  'date {c} finished                 '.format(c=current_date))
            current_date += 1