Example #1
    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links"""

        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError as e:
            # Page was poorly formatted, oh well
            _linkscraper_logger.error('Exception encountered when link extracting page')
            return []

        # add these links to our Url item
        urls = list()
        for link in links:
            url = ScrapedUrl()
            url['url'] = link.url
            url['domain'] = UrlUtility.get_domain(link.url)
            url['last_visited'] = datetime(1, 1, 1)
            if url not in urls:
                urls.append(url)

        return urls
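
The `url not in urls` membership test rescans the list once per link, so deduplication here is quadratic in the number of extracted links. A minimal set-based sketch of the same loop, assuming (as the item fields imply) that a `ScrapedUrl` is fully determined by its URL, since `domain` and `last_visited` are derived from it:

        # dedup on the URL string: one set lookup per link instead of a list scan
        seen = set()
        urls = []
        for link in links:
            if link.url in seen:
                continue
            seen.add(link.url)
            url = ScrapedUrl()
            url['url'] = link.url
            url['domain'] = UrlUtility.get_domain(link.url)
            url['last_visited'] = datetime(1, 1, 1)
            urls.append(url)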
Example #2
    def clean_organization_url(self):
        url = self.cleaned_data['organization_url']
        if url:
            try:
                return UrlUtility.get_domain(url)
            except Exception:
                raise ValidationError("Oops! We couldn't recognize that URL's domain.")
        else:
            return None
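
Every snippet on this page funnels URLs through `UrlUtility.get_domain`, which is never shown. A minimal sketch of what such a helper might look like on top of `urlparse`; the signature, the `www.` stripping, and the semantics of the `no_exception` flag (assumed here to return None instead of raising) are inferences from the call sites, not the project's actual code:

from urllib.parse import urlparse  # Python 2: from urlparse import urlparse


class UrlUtility(object):
    """Hypothetical stand-in -- the real UrlUtility is not shown in these examples."""

    @staticmethod
    def get_domain(url, no_exception=False):
        """Return the bare domain of a URL, e.g. 'example.org' for
        'http://www.example.org/about'. Raises ValueError on input it
        cannot parse unless no_exception=True, which returns None instead."""
        parsed = urlparse(url if '//' in url else '//' + url)
        netloc = parsed.netloc.lower().split(':')[0]  # drop any port
        if netloc.startswith('www.'):
            netloc = netloc[len('www.'):]
        if not netloc or '.' not in netloc:
            if no_exception:
                return None
            raise ValueError('could not determine the domain of %r' % url)
        return netloc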
Example #3
    def __init__(self, org_model, id):
        """
        Constructs a new SmallOrganization instance.

        Arguments:
            org_model (Organization): The organization from which to create a SmallOrganization.
            id (ObjectId): The database ID of the organization.
        """
        self.id = id
        self.page_rank_info = org_model.page_rank_info
        try:
            self.org_domain = UrlUtility.get_domain(org_model.organization_url, no_exception=False)
        except Exception:
            self.org_domain = None
        self.page_rank = None
        self.page_rank_weight = None
Example #4
    def _merge_page_rank_info(self, new_references, existing_references, organization_url):
        if existing_references is None:
            return new_references

        org_domain = UrlUtility().get_domain(organization_url)
        for ref in new_references.references:
            ref_exists = False
            # Search for existing references from one organization to another
            for exist_ref in existing_references.references:
                if ref.org_domain == exist_ref.org_domain:
                    # We found existing data for references from Org A to Org B
                    ref_exists = True
                    for page in ref.pages:
                        page_exists = False
                        # Search if we have data from this specific URL to this specific organization
                        for exist_page in exist_ref.pages:
                            if page.url == exist_page.url:
                                # We found existing data for references from URL A to Org B
                                page_exists = True
                                count_diff = page.count - exist_page.count
                                if count_diff != 0:
                                    # This page must have changed because the
                                    # number of references differs; update everything
                                    exist_page.count = page.count
                                    exist_ref.count += count_diff
                                    existing_references.total_with_self += count_diff
                                    if exist_ref.org_domain != org_domain:
                                        # This value is only updated if Organizations A and B are different
                                        existing_references.total += count_diff
                                break
                        if not page_exists:
                            # We have recorded other references to this organization, but none from this url
                            exist_ref.pages.append(page)
                            exist_ref.count += page.count
                            existing_references.total_with_self += page.count
                            if exist_ref.org_domain != org_domain:
                                existing_references.total += page.count
                    break
            # If this organization has not yet referenced the specified outside org, add it
            if not ref_exists:
                existing_references.references.append(ref)
                existing_references.total_with_self += ref.count
                if ref.org_domain != org_domain:
                    existing_references.total += ref.count
        return existing_references
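
The two running totals differ only in how self-references are counted: `total_with_self` grows for every merged reference, while `total` skips references whose `org_domain` equals the organization's own domain. In the dict shape the scraper in Example #8 produces, hypothetical merged data for an organization at example.org whose pages link three times to foo.com and twice back to itself would read:

    page_rank_info = {
        "total": 3,            # self-references to example.org excluded
        "total_with_self": 5,  # every reference counted
        "references": [
            {"org_domain": "foo.com", "count": 3,
             "pages": [{"url": "http://example.org/about", "count": 3}]},
            {"org_domain": "example.org", "count": 2,
             "pages": [{"url": "http://example.org/blog", "count": 2}]},
        ],
    }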
Example #5
    def check_valid_org(self, response):
        """
        Checks if the current page is a valid page for an organization's homepage.

        Arguments:
            response (Response): Scrapy Response object of the page that is to be scraped.

        Returns:
            True if it's a valid organization page or already in the database.
            False if it's not the homepage.
        """
        # If already in database, then valid
        url = OrgUrlScraper().parse(response)
        org_dto = self.org_dao().find(organization_url=url)
        if org_dto:
            return True

        # If not homepage, then return false and make sure homepage is added to scrape:
        home_url_obj = urlparse(response.request.url)
        if home_url_obj.path and home_url_obj.path != '/':
            home_url = home_url_obj.scheme + '://' + home_url_obj.netloc + '/'
            home_domain = UrlUtility.get_domain(home_url)
            meta = URLMetadata(url=home_url,
                               domain=home_domain,
                               last_visited=datetime(1, 1, 1))
            self.url_frontier.put_url(meta)
            return False
        else:
            # this is homepage, scrape for keywords
            hxs = HtmlXPathSelector(response)
            site_text = hxs.select('//html//text()').extract()
            site_text = [
                element.strip() for element in site_text
                if element.strip() != ''
            ]

            for word in self._required_words:
                for sentence in site_text:
                    sentence = self._punctuation.sub(' ', sentence)
                    if word in sentence.lower():
                        return True

        # no required keyword found on the homepage
        return False
Example #6
def request_organization(request):
    """
    Renders the Request Organization page if the user is logged in.

    Returns:
        A rendered page containing the Request Organization form.
    """
    if 'user_id' not in request.session:
        logger.error('Bad request made for organization seed without login')
        return unauthorized(request)
    else:
        user_id = request.session['user_id']

    form = RequestOrgForm(request.POST or None)
    error = ''
    success = ''

    if request.method == 'POST':
        if form.is_valid():
            url = form.cleaned_data['url']
            dao = ctx.get_object('URLMetadataDAO')

            try:
                metadata = URLMetadata(url=url,
                                       domain=UrlUtility.get_domain(url))
            except ValueError:
                error = "Oops! We don't recognize that domain. Please try another."

            if not error:
                try:
                    dto = DTOConverter.to_dto(URLMetadataDTO, metadata)
                    dao.create_update(dto)
                    logger.info(
                        'Org seed with url={0} requested by user={1}'.format(
                            url, user_id))
                    success = 'Your request has been sent successfully!'
                except Exception:
                    error = 'Something went wrong with your request. Please try again later.'

    return render(request, 'organization/request_organization.html', {
        'form': form,
        'success': success,
        'error': error
    })
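
Wiring this view into a project takes the usual URLconf entry; a sketch with hypothetical module and URL paths, using the `django.conf.urls.url` style of Django versions contemporary with this code:

from django.conf.urls import url

from organization import views  # hypothetical module path

urlpatterns = [
    url(r'^organization/request/$', views.request_organization,
        name='request_organization'),
]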
Example #7
    def clean_url(self):
        url = self.cleaned_data['url']
        ctx = ApplicationContext(DAOContext())
        org_dao = ctx.get_object('OrganizationDAO')
        url_metadata_dao = ctx.get_object('URLMetadataDAO')

        try:
            domain = UrlUtility().get_domain(url)
        except Exception:
            raise ValidationError(
                "Oops! We couldn't find information on that domain.")

        if org_dao.find(organization_url=domain) or url_metadata_dao.find(
                domain=domain):
            raise ValidationError(
                "Oops! Looks like we already have information on that organization."
            )

        return url
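
Judging by the matching `cleaned_data['url']` key, this `clean_url` presumably lives on the `RequestOrgForm` used in Example #6. A minimal sketch of such a form; the field declaration is inferred and the label is made up:

from django import forms


class RequestOrgForm(forms.Form):
    # field name inferred from self.cleaned_data['url'] in the method above
    url = forms.URLField(label='Organization URL')

The `clean_url` method above would then sit on this class alongside the field.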
Example #8
    def parse(self, response):
        """Scrape a spider's HttpRequest.Response for links"""

        # get domain; fall back to None so the self-reference check below still works
        try:
            org_domain = UrlUtility.get_domain(response.request.url, False)
        except Exception:
            _linkscraper_logger.error(
                'Exception encountered when trying to find the domain of ' +
                response.request.url)
            org_domain = None

        # sanity check
        if self._link_extractor is None:
            self._link_extractor = SgmlLinkExtractor()

        # use scrapy SgmlLinkExtractor to extract links
        try:
            links = self._link_extractor.extract_links(response)
        except SGMLParseError as e:
            # Page was poorly formatted, oh well
            _linkscraper_logger.error('Exception encountered when scraping page for PageRankInfo')
            return None

        # add these links to our Page Rank Info
        page_rank_info = {
            "total": 0,
            "total_with_self": 0,
            "references": []
        }
        for link in links:
            url = link.url
            try:
                domain = UrlUtility.get_domain(url, False)
            except Exception:
                _linkscraper_logger.error(
                    'Exception encountered when trying to find the domain of ' + url)
                continue
            ref_found = False
            for ref in page_rank_info["references"]:
                if ref["org_domain"] == domain:
                    ref_found = True
                    ref["count"] += 1
                    ref["pages"][0]["count"] += 1
                    page_rank_info["total_with_self"] += 1
                    if domain != org_domain:
                        page_rank_info["total"] += 1
                    break
            if not ref_found:
                page_rank_info["references"].append(
                    {
                        "org_domain": domain,
                        "count": 1,
                        "pages": [
                            {
                                "url": response.url,
                                "count": 1
                            }
                        ]
                    }
                )
                page_rank_info["total_with_self"] += 1
                if domain != org_domain:
                    page_rank_info["total"] += 1

        return page_rank_info
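
Because one call to `parse` only ever sees a single response, every reference it emits carries exactly one entry in `pages`, which is why the `ref["pages"][0]["count"] += 1` shortcut is safe; accumulating multiple pages per domain is left to the merge in Example #4. The counting loop itself collapses to a few lines with `collections.Counter`; a sketch under the same `UrlUtility` assumption as above, not the project's code:

from collections import Counter


def count_outbound_links(links, org_domain, page_url):
    """Tally outbound links per destination domain for a single page."""
    counts = Counter()
    for link in links:
        try:
            domain = UrlUtility.get_domain(link.url, False)
        except Exception:
            continue  # skip URLs whose domain cannot be determined
        counts[domain] += 1
    return {
        "total": sum(n for d, n in counts.items() if d != org_domain),
        "total_with_self": sum(counts.values()),
        "references": [
            {"org_domain": d, "count": n,
             "pages": [{"url": page_url, "count": n}]}
            for d, n in counts.items()
        ],
    }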