def next(self):
    '''(Crawler) -> newspaper.Article
    returns the next article in the sequence
    '''
    # standard non-recursive tree iteration
    while True:
        if len(self.visit_queue) <= 0:
            raise StopIteration
        current_url = self.visit_queue.pop()
        if self._should_skip():
            logging.info(u"skipping {0} randomly".format(current_url))
            continue
        logging.info(u"visiting {0}".format(current_url))
        # use newspaper to download and parse the article
        article = ExplorerArticle(current_url)
        article.download()
        # get urls from the article
        for link in article.get_links():
            url = urljoin(current_url, link.href, False)
            if self.url_in_filter(url, self.filters):
                logging.info(u"skipping {0} because it matches a filter".format(url))
                continue
            try:
                parsed_url = urlparse(url)
                parsed_as_list = list(parsed_url)
                if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                    logging.info(u"skipping url with invalid scheme: {0}".format(url))
                    continue
                # blank the fragment (index 5), then canonicalize
                parsed_as_list[5] = ''
                url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
            except Exception as e:
                logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                continue
            # stay on the crawled site's domain
            if not parsed_url.netloc.endswith(self.domain):
                continue
            if url in self.visited_urls:
                continue
            self.visit_queue.appendleft(url)
            self.visited_urls.add(url)
            logging.info(u"added {0} to the visit queue".format(url))
        self.pages_visited += 1
        return article
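# The subtlest step in next() above is the url normalization: the fragment
# (index 5 of urlparse's 6-tuple) is blanked and the remaining parts are
# canonicalized through urlnorm before the duplicate check. Below is a
# minimal sketch of that step in isolation; the normalize_url helper name
# is ours, not part of the crawler.
from urlparse import urlparse, urlunparse  # urllib.parse on Python 3
import urlnorm

def normalize_url(url):
    '''Return a normalized http(s) url with its fragment removed,
    or None if the url is unusable. Illustrative helper.'''
    parsed = urlparse(url)
    if parsed.scheme not in (u"http", u"https"):
        return None
    parts = list(parsed)
    parts[5] = ''  # index 5 is the fragment
    try:
        return urlunparse(urlnorm.norm_tuple(*parts))
    except Exception:
        return None  # treat malformed urls as unusable, as the crawler does

# e.g. normalize_url(u"HTTP://Example.com/a/../b#frag") should yield
# u"http://example.com/b"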
def parse_articles_per_site(db_keywords, source_sites, twitter_accounts_explorer, site):
    logging.info("Started multiprocessing of Site: %s", site.name)
    # Set up per-site logging
    setup_logging(site.name)
    article_count = 0
    newspaper_articles = []
    crawlersource_articles = []
    logging.info("Site: %s Type: %i" % (site.name, site.mode))
    # 0 = newspaper, 1 = crawler, 2 = both
    if site.mode == 0 or site.mode == 2:
        logging.disable(logging.ERROR)
        newspaper_source = newspaper.build(site.url,
                                           memoize_articles=False,
                                           keep_article_html=True,
                                           fetch_images=False,
                                           number_threads=1)
        logging.disable(logging.NOTSET)
        newspaper_articles = newspaper_source.articles
        article_count += newspaper_source.size()
        logging.info("populated {0} articles using newspaper".format(article_count))
    if site.mode == 1 or site.mode == 2:
        crawlersource_articles = Crawler.Crawler(site)
        article_count += crawlersource_articles.probabilistic_n
        logging.debug("expecting {0} from plan b crawler".format(
            crawlersource_articles.probabilistic_n))
    article_iterator = itertools.chain(iter(newspaper_articles), crawlersource_articles).__iter__()
    processed = 0
    filters = site.referringsitefilter_set.all()
    while True:
        try:
            try:
                article = article_iterator.next()
            except StopIteration:
                break
            # all the iteration bookkeeping sits at the top because this
            # loop uses continue extensively
            processed += 1
            if url_in_filter(article.url, filters):
                logging.info("{0} matches a filter, skipping".format(article.url))
                continue
            print("%s (Article|%s) %i/%i \r" % (str(timezone.localtime(timezone.now()))[:-13],
                                                site.name, processed, article_count))
            logging.info("Processing %s" % article.url)
            url = article.url
            # strip the "www." prefix so urls compare consistently
            if 'http://www.' in url:
                url = url[:7] + url[11:]
            elif 'https://www.' in url:
                url = url[:8] + url[12:]
            article = ExplorerArticle(article.url)
            logging.debug("ExplorerArticle created")
            # Try to download and extract the useful data
            if not article.is_downloaded:
                if not article.download():
                    logging.warning("article skipped because download failed")
                    continue
            url = article.canonical_url
            if not article.is_parsed:
                if not article.preliminary_parse():
                    logging.warning("article skipped because parse failed")
                    continue
            logging.debug("Article parsed")
            logging.debug(u"Title: {0}".format(repr(article.title)))
            if not article.title:
                logging.info("article missing title, skipping")
                continue
            if not article.text:
                logging.info("article missing text, skipping")
                continue
            # Regex the keywords from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within the article's html
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article, twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))
            # [] converts to False, so this skips articles with no match at all
            if (not keywords) and (not sources[0]) and (not twitter_accounts[0]):
                logging.debug("skipping article because it's not a match")
                continue
            article.newspaper_parse()
            text = article._newspaper_text
            # Rerun get_keywords with the text parsed by newspaper
            keywords = get_keywords(article, db_keywords)
            if (not keywords) and (not sources[0]) and (not twitter_accounts[0]):
                logging.debug("skipping article because it's not a match")
                continue
            logging.info("match found")
            # load selectors from the db
            # each parameter is a namedtuple of "css" and "regex"
            title = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=0)) or article.title
            authors = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=1))
            if authors:
                authors = [authors]
            else:
                authors = article.authors
            pub_date = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=2))
            if pub_date:
                pub_date = dateutil.parser.parse(pub_date)
            else:
                pub_date = get_pub_date(article)
            mod_date = article.evaluate_css_selectors(
                site.referringsitecssselector_set.filter(field=3))
            language = article.language
            date_now = timezone.localtime(timezone.now())
            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # The article is new to the database, so add it
                db_article = Article(title=title, url=url, domain=site.url,
                                     date_added=date_now,
                                     date_last_seen=date_now,
                                     date_published=pub_date,
                                     date_modified=mod_date,
                                     language=language,
                                     text=text)
                db_article.save()
                db_article = Article.objects.get(url=url)
                for key in keywords:
                    db_article.keyword_set.create(name=key)
                for author in authors:
                    db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     anchor_text=source[2], matched=True,
                                                     local=(source[1] in site.url))
                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     anchor_text=source[2], matched=False,
                                                     local=(source[1] in site.url))
            else:
                logging.info("Modifying existing Article in the DB")
                # The article already exists, so update every field except
                # date_added
                db_article = db_article_list[0]
                db_article.title = title
                db_article.url = url
                db_article.domain = site.url
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_last_seen = date_now
                db_article.date_published = pub_date
                db_article.date_modified = mod_date
                db_article.language = language
                db_article.text = text
                db_article.save()
                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)
                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         anchor_text=source[2], matched=True,
                                                         local=(source[1] in site.url))
                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         anchor_text=source[2], matched=False,
                                                         local=(source[1] in site.url))
            warc_creator.enqueue_article(url)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            logging.exception("Unhandled exception while crawling: " + str(e))
    # logged both before and after the logging handler is reset
    logging.info("Finished Site: %s" % site.name)
    setup_logging(increment=False)
    logging.info("Finished Site: %s" % site.name)
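# Every selector lookup above follows the same pattern: prefer the
# site-specific CSS selector, fall back to the auto-extracted value. A small
# sketch of that pattern as a standalone helper; extract_field and the
# field-index names are ours, for illustration only (the indices mirror the
# field=0..3 filters above).
TITLE, AUTHORS, PUB_DATE, MOD_DATE = range(4)

def extract_field(article, site, field, fallback=None):
    '''Return the value of the site's CSS selector for `field`,
    or `fallback` when no selector matches. Illustrative helper.'''
    selectors = site.referringsitecssselector_set.filter(field=field)
    return article.evaluate_css_selectors(selectors) or fallback

# usage, assuming the same article/site objects as above:
# title = extract_field(article, site, TITLE, fallback=article.title)
# pub_date = extract_field(article, site, PUB_DATE) or get_pub_date(article)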
def next(self):
    '''(Crawler) -> newspaper.Article
    returns the next article in the sequence
    '''
    # standard non-recursive tree iteration
    with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
        try:
            current_level = 0
            while True:
                if self.limit > 0 and self.visited_count > self.limit:
                    raise StopIteration('Limit reached: {:d}'.format(self.limit))
                # if(self.pages_visited > self.probabilistic_n):
                #     raise StopIteration
                # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                # row = self.cursor.fetchone()
                # if(row):
                #     row_id = row[0]
                #     current_url = row[1]
                #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                # else:
                #     raise StopIteration
                # if(self._should_skip()):
                #     logging.info(u"skipping {0} randomly".format(current_url))
                #     continue
                try:
                    if self.site.is_shallow:
                        # shallow crawls track (url, level) pairs
                        current = self.to_visit.get_nowait()
                        current_url = current[0]
                        current_level = current[1]
                        logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                    else:
                        current_url = self.to_visit.get_nowait()
                except Empty:
                    # queue exhausted: mark the site shallow, reseed the crawl
                    # from the root url, and reset the on-disk ignore filter
                    self.site.is_shallow = True  # TODO: this shallow reset is marked for deletion
                    self.to_visit.put((self.site.url, str(0)))
                    self.ignore_filter = ScalableBloomFilter(
                        initial_capacity=10000000,
                        error_rate=0.00001)
                    ignore_filter_file.close()
                    os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                    logging.info("stopped iteration")
                    logging.info(u"{0}".format(self.site.url))
                    # sentinel exception: tells the caller the crawl was reset
                    raise ZeroDivisionError

                logging.info(u"visiting {0}".format(current_url))
                self.visited_count += 1
                # use newspaper to download and parse the article
                article = ExplorerArticle(current_url)
                article.download()
                if self.site.is_shallow and int(current_level) > self.level:
                    continue
                # get urls from the article
                for link in article.get_links():
                    url = urljoin(current_url, link.href, False)
                    if self.url_in_filter(url, self.filters):
                        logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                        continue
                    try:
                        parsed_url = urlparse(url)
                        parsed_as_list = list(parsed_url)
                        if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                            logging.info(u"skipping url with invalid scheme: {0}".format(url))
                            continue
                        # blank the fragment (index 5), then canonicalize
                        parsed_as_list[5] = ''
                        url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                    except Exception as e:
                        logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                        continue
                    if not parsed_url.netloc.endswith(self.domain):
                        continue
                    # If the url has already been added to the ignore list, skip it
                    if url in self.ignore_filter:
                        continue
                    # Ignore subscribe links (common across many domains) unless
                    # "subscribe" only appears as part of a hyphenated slug
                    if u"subscribe" in url and not (u"-subscribe" in url or u"subscribe-" in url):
                        continue
                    # Append the url to the to_visit queue, with its level when
                    # crawling shallowly
                    if self.site.is_shallow:
                        self.to_visit.put((url, str(int(current_level) + 1)))
                        logging.info(u"added {0} to to_visit at level {1}".format(
                            url, str(int(current_level) + 1)))
                    else:
                        self.to_visit.put(url)
                        logging.info(u"added {0} to to_visit".format(url))
                    # Record the url so duplicates are skipped, both in memory
                    # and on disk
                    self.ignore_filter.add(url)
                    ignore_filter_file.write(url.encode('utf8') + "\n")
                # Update the queue
                self.to_visit.task_done()
                return article
        except StopIteration:
            raise
        except Exception:
            raise  # propagate everything else unchanged
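# This crawler pairs an in-memory ScalableBloomFilter with an append-only
# text file so the set of seen urls can survive restarts. A minimal sketch
# of how such a filter could be rebuilt on startup; the load_ignore_filter
# helper is ours, the pybloom import is assumed (the crawler's own import
# is not shown), and we assume the one-url-per-line file format written above.
import os
from pybloom import ScalableBloomFilter  # assumed source of ScalableBloomFilter

def load_ignore_filter(path):
    '''Rebuild the bloom filter from the append-only url log, if present.
    Illustrative helper, not part of the crawler.'''
    bloom = ScalableBloomFilter(initial_capacity=10000000, error_rate=0.00001)
    if os.path.exists(path):
        with open(path, 'r') as f:
            for line in f:
                url = line.strip()
                if url:
                    bloom.add(url.decode('utf8'))
    return bloom

# usage:
# self.ignore_filter = load_ignore_filter(
#     '../ignore_filter/' + self.site.name + '_ignore_file.txt')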
def parse_articles(referring_sites, db_keywords, source_sites, twitter_accounts_explorer):
    """
    (list of dict, list of str, list of str, list of str) -> None
    Downloads each article from every referring site, extracts it, and
    compares it against the foreign sites and keywords provided. Articles
    that match are stored in the Django database.

    Keyword arguments:
    referring_sites           -- list of site dicts with "name", "url",
                                 "type" and "filter" keys
    db_keywords               -- list of keywords
    source_sites              -- list of foreign sites
    twitter_accounts_explorer -- list of twitter accounts
    """
    added, updated, failed, no_match = 0, 0, 0, 0
    # for each article on each site, download and parse the important data
    for site in referring_sites:
        article_count = 0
        newspaper_articles = []
        crawlersource_articles = []
        logging.info("Site: %s Type: %i" % (site['name'], site['type']))
        # 0 = newspaper, 1 = crawler, 2 = both
        if site["type"] == 0 or site["type"] == 2:
            logging.disable(logging.ERROR)
            newspaper_source = newspaper.build(site["url"],
                                               memoize_articles=False,
                                               keep_article_html=True,
                                               fetch_images=False,
                                               language='en',
                                               number_threads=1)
            logging.disable(logging.NOTSET)
            newspaper_articles = newspaper_source.articles
            article_count += newspaper_source.size()
            logging.info("populated {0} articles using newspaper".format(article_count))
        if site["type"] == 1 or site["type"] == 2:
            crawlersource_articles = Crawler.Crawler(site["url"], site["filter"])
            article_count += crawlersource_articles.probabilistic_n
            logging.debug("expecting {0} from plan b crawler".format(
                crawlersource_articles.probabilistic_n))
        article_iterator = itertools.chain(iter(newspaper_articles), crawlersource_articles)
        processed = 0
        for article in article_iterator:
            # all the iteration bookkeeping sits at the top because this
            # loop uses continue extensively
            processed += 1
            # Check for any new command on the communication stream
            check_command()
            if url_in_filter(article.url, site["filter"]):
                logging.info("{0} matches a filter, skipping".format(article.url))
                continue
            print("%s (Article|%s) %i/%i \r" % (str(timezone.localtime(timezone.now()))[:-13],
                                                site["name"], processed, article_count))
            logging.info("Processing %s" % article.url)
            url = article.url
            # strip the "www." prefix so urls compare consistently
            if 'http://www.' in url:
                url = url[:7] + url[11:]
            elif 'https://www.' in url:
                url = url[:8] + url[12:]
            article = ExplorerArticle(article.url)
            # Try to download and extract the useful data
            if not article.is_downloaded:
                if not article.download():
                    logging.warning("article skipped because download failed")
                    continue
            article.preliminary_parse()
            if not article.title:
                logging.info("article missing title, skipping")
                continue
            if not article.text:
                logging.info("article missing text, skipping")
                continue
            # Regex the keywords from the article's text
            keywords = get_keywords(article, db_keywords)
            logging.debug(u"matched keywords: {0}".format(repr(keywords)))
            # Regex the links within the article's html
            sources = get_sources_sites(article, source_sites)
            logging.debug(u"matched sources: {0}".format(repr(sources)))
            twitter_accounts = get_sources_twitter(article, twitter_accounts_explorer)
            logging.debug(u"matched twitter_accounts: {0}".format(repr(twitter_accounts[0])))
            # [] converts to False
            if (not keywords) or (not sources[0]) or (not twitter_accounts[0]):
                logging.debug("skipping article because it's not a match")
                continue
            logging.info("match found")
            article.newspaper_parse()
            authors = article.authors
            pub_date = get_pub_date(article)
            # Check if the entry already exists
            db_article_list = Article.objects.filter(url=url)
            if not db_article_list:
                logging.info("Adding new Article to the DB")
                # The article is new to the database, so add it
                db_article = Article(title=article.title, url=url,
                                     domain=site["url"],
                                     date_added=timezone.localtime(timezone.now()),
                                     date_published=pub_date)
                db_article.save()
                db_article = Article.objects.get(url=url)
                for key in keywords:
                    db_article.keyword_set.create(name=key)
                for author in authors:
                    db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     matched=True,
                                                     local=(source[1] in site["url"]))
                for source in sources[1]:
                    db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                     matched=False,
                                                     local=(source[1] in site["url"]))
                added += 1
            else:
                logging.info("Modifying existing Article in the DB")
                # The article already exists, so update every field except
                # date_added
                db_article = db_article_list[0]
                db_article.title = article.title
                db_article.url = url
                db_article.domain = site["url"]
                # Do not update the added date
                # db_article.date_added = today
                db_article.date_published = pub_date
                db_article.save()
                for key in keywords:
                    if not db_article.keyword_set.filter(name=key):
                        db_article.keyword_set.create(name=key)
                for author in authors:
                    if not db_article.author_set.filter(name=author):
                        db_article.author_set.create(name=author)
                for account in twitter_accounts[0]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=True)
                for account in twitter_accounts[1]:
                    if not db_article.sourcetwitter_set.filter(name=account):
                        db_article.sourcetwitter_set.create(name=account, matched=False)
                for source in sources[0]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         matched=True,
                                                         local=(source[1] in site["url"]))
                for source in sources[1]:
                    if not db_article.sourcesite_set.filter(url=source[0]):
                        db_article.sourcesite_set.create(url=source[0], domain=source[1],
                                                         matched=False,
                                                         local=(source[1] in site["url"]))
            warc_creator.create_article_warc(url)
        logging.info("Finished Site: %s" % site['name'])
        print("%s (Article|%s) %i/%i " % (str(timezone.localtime(timezone.now()))[:-13],
                                          site["name"], processed, article_count))
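# Both parsers above splice the "www." prefix out of urls with fixed offsets
# (url[:7] + url[11:]). A sketch of the same normalization as a reusable
# helper; the strip_www name is ours, and it uses startswith rather than a
# substring test, which also guards against "www." appearing mid-url.
def strip_www(url):
    '''Remove a leading "www." from an http(s) url so that
    http://www.example.com and http://example.com compare equal.'''
    if url.startswith('http://www.'):
        return 'http://' + url[len('http://www.'):]
    if url.startswith('https://www.'):
        return 'https://' + url[len('https://www.'):]
    return url

# strip_www('http://www.example.com/a')  -> 'http://example.com/a'
# strip_www('https://example.com/a')     -> 'https://example.com/a' (unchanged)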
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'Frontend')))

import django
os.environ['DJANGO_SETTINGS_MODULE'] = 'Frontend.settings'

# For Models connecting with the Django Database
from explorer.models import *
from ExplorerArticle import ExplorerArticle

if __name__ == "__main__":
    site_id = raw_input("site id: ")
    django.setup()
    site = ReferringSite.objects.get(pk=site_id)
    url = raw_input("url: ")
    article = ExplorerArticle(url)
    article.download()
    article.preliminary_parse()
    article.newspaper_parse()
    # group the site's CSS selectors by the field they extract
    fields = {}
    for css in site.referringsitecssselector_set.all():
        if css.field_choice not in fields:
            fields[css.field_choice] = []
        fields[css.field_choice].append({'pattern': css.pattern, 'regex': css.regex})
    if not fields:
        print "no fields"
    for key, value in fields.iteritems():
        print "field \"{0}\"".format(key)
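# The selector-grouping loop above is the usual group-by-key pattern;
# collections.defaultdict expresses it a bit more directly. A sketch using
# the same attribute names as above:
from collections import defaultdict

fields = defaultdict(list)
for css in site.referringsitecssselector_set.all():
    fields[css.field_choice].append({'pattern': css.pattern, 'regex': css.regex})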