Example #1
def main():
    url = "http://matfystutor.dk/rus/holdtutorer/"
    r = requests.get(url)
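    # tuple-unpack: the page is expected to expose exactly one top-level microdata item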
    item, = microdata.get_items(r.text)

    year = item.year

    rusclass_list = []

    o = item.get_all('rusclass')
    for rusclass in o:
        tutors = []
        for tutor in rusclass.get_all("tutor"):
            name = tutor.name
            phone = strip_prefix(str(tutor.phone), "tel:")
            tutors.append({
                'name': name,
                'phone': phone,
            })

        rusclass_list.append({
            'name': rusclass.name,
            'tutors': tutors,
        })

    template_name = 'templates/rusclass/tutorhold.tex'
    context = {'rusclass_list': rusclass_list, 'year': year}
    print(render_to_string(template_name, context))
Example #2
    def parse(self, source, sink, **kwargs):
        """
        Pass in a file or file-like object containing html5 microdata
        and populate the sink graph with triples.
        """
        for item in microdata.get_items(source.getByteStream()):
            self._add_item(item, sink)
Example #3
    def update_price(self):
        # check if this is an Amazon product
        if self.distributor.name == 'Amazon':
            amazon = AmazonAPI(settings.AWS_ACCESS_KEY_ID, settings.AWS_SECRET_ACCESS_KEY, settings.AWS_ASSOCIATE_TAG)
            try:
                product = amazon.lookup(ItemId=self.part.asin)
                price = product.price_and_currency
                return price[0]
            except Exception:
                # fall through and return None if the Amazon lookup fails
                pass
        else:
            import urllib2
            from lxml import etree
            import microdata
            import urllib

            items = microdata.get_items(urllib.urlopen(self.url))
            for i in items:
                if i.offers:
                    return "%s (md)".replace("$", "") % i.offers.price.strip().replace("$", "")
            html = urllib2.urlopen(self.url).read()
            tree = etree.HTML(html)
            price = tree.xpath("%s/text()[1]" % self.xpath)
            try:
                return "%s (xp)" % price[0].replace("$", "")
            except Exception:
                return "N/A"
Example #4
def get_from_html(html_text, url):
    soup = BeautifulSoup(html_text, "html.parser")

    # first try finding ld+json as its most common
    for ld in soup.find_all('script', type='application/ld+json'):
        try:
            ld_json = json.loads(ld.string.replace('\n', ''))
            if type(ld_json) != list:
                ld_json = [ld_json]

            for ld_json_item in ld_json:
                # recipes type might be wrapped in @graph type
                if '@graph' in ld_json_item:
                    for x in ld_json_item['@graph']:
                        if '@type' in x and x['@type'] == 'Recipe':
                            ld_json_item = x

                if '@type' in ld_json_item and ld_json_item['@type'] == 'Recipe':
                    return find_recipe_json(ld_json_item, url)
        except JSONDecodeError as e:
            return JsonResponse({'error': True, 'msg': _('The requested site provided malformed data and cannot be read.')}, status=400)

    # now try to find microdata
    items = microdata.get_items(html_text)
    for i in items:
        md_json = json.loads(i.json())
        if 'schema.org/Recipe' in str(md_json['type']):
            return find_recipe_json(md_json['properties'], url)

    return JsonResponse({'error': True, 'msg': _('The requested site does not provide any recognized data format to import the recipe from.')}, status=400)
Example #5
def parseMicrodata(url):
    req = urllib2.Request(url)
    try:
        response = urllib2.urlopen(req)
    except urllib2.URLError as err:
        logger.error("Error while fetching %s: %s" % (url, err.msg))
        raise

    items = microdata.get_items(response)

    event_list = []

    for ev in filter(
            lambda x: microdata.URI("http://schema.org/Event") in x.itemtype,
            items):
        start = datetime.strptime(ev.startdate, "%Y-%m-%dT%H:%M:%SZ")
        start = start.replace(tzinfo=timezone("UTC")).astimezone(tz)

        if ev.enddate:
            end = datetime.strptime(ev.enddate, "%Y-%m-%dT%H:%M:%SZ")
            end = end.replace(tzinfo=timezone("UTC")).astimezone(tz)
        else:
            end = start + timedelta(hours=1)

        event_data = {
            "title": ev.name,
            "description": ev.name,
            "start": start.strftime(dt_format),
            "end": end.strftime(dt_format),
            "location": ev.location.name,
            "url": urljoin(url, str(ev.url))
        }
        event_list.append(event_data)

    return event_list
Example #6
    def bookmarklet(self, request):
        """
        Fetches the recipe for the url, saves the recipe, and returns a response to the chrome extension
        """
        u = ICurrentUser(request)

        url = request.args['uri'][0]
        pageSource = yield treq.get(url).addCallback(treq.content)
        items = microdata.get_items(pageSource)
        recipesSaved = []

        for i in items:
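            # itemtype entries are URI objects, so compare their string form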
            itemTypeArray = [x.string for x in i.itemtype]
            if RECIPE_SCHEMA in itemTypeArray:
                recipe = i
                saveItem = Recipe.fromMicrodata(recipe, u.email)
                Recipe.saveOnlyOnce(saveItem)
                recipesSaved.append({
                    "name": saveItem.name,
                    "urlKey": saveItem.urlKey
                })
                break

        if len(recipesSaved) == 0:
            defer.returnValue(
                ClipResponse(status=RS.error, message=ResponseMsg.noRecipe))

        defer.returnValue(ClipResponse(status=RS.ok, recipes=recipesSaved))
Example #7
def extract_microdata_from_html(html_str):
    import microdata
    try:
        items = microdata.get_items(html_str)
        return [item.json_dict() for item in items]
    except Exception as e:
        return [{'extraction_error': str(e)}]
Example #8
    def test_parse_multiple_props(self):
        items = get_items(open("test-data/multiple-props.html"))

        self.assertEqual(len(items), 2)

        item = items[0]
        i = json.loads(item.json())
        # both names `John Doe and Jane Dun` should appear under author and creator props
        self.assertEqual(
            len(i["properties"]["author"][0]["properties"]["name"]), 2)
        self.assertEqual(i["properties"]["author"][0]["properties"]["name"],
                         ["John Doe", "Jane Dun"])
        self.assertEqual(
            len(i["properties"]["creator"][0]["properties"]["name"]), 2)
        self.assertEqual(i["properties"]["creator"][0]["properties"]["name"],
                         ["John Doe", "Jane Dun"])

        # nested multiple props
        self.assertEqual(item.author.affiliation.name, "Stanford University")
        self.assertEqual(item.creator.affiliation.name, "Stanford University")
        self.assertEqual(item.author.alumniOf.name, "Stanford University")
        self.assertEqual(item.creator.alumniOf.name, "Stanford University")

        item = items[1]
        i = json.loads(item.json())
        # test case for original issue #3
        self.assertEqual(i["properties"]["favorite-color"][0], "orange")
        self.assertEqual(i["properties"]["favorite-fruit"][0], "orange")
Example #9
def parseMicrodata(url):
	req = urllib2.Request(url)
	try:
		response = urllib2.urlopen(req)
	except urllib2.URLError as err:
		logger.error("Error while fetching %s: %s" % (url, err.msg))
		raise

	items = microdata.get_items(response)

	event_list = []

	for ev in filter(lambda x: microdata.URI("http://schema.org/Event") in x.itemtype, items):
		start = datetime.strptime(ev.startdate, "%Y-%m-%dT%H:%M:%SZ")
		start = start.replace(tzinfo=timezone("UTC")).astimezone(tz)

		if ev.enddate:
			end = datetime.strptime(ev.enddate, "%Y-%m-%dT%H:%M:%SZ")
			end = end.replace(tzinfo=timezone("UTC")).astimezone(tz)
		else:
			end = start + timedelta(hours=1)

		event_data = {
			"title": ev.name,
			"description": ev.name,
			"start": start.strftime(dt_format),
			"end": end.strftime(dt_format),
			"location": ev.location.name,
			"url": urljoin (url, str(ev.url))
		}
		event_list.append(event_data)

	return event_list
Example #10
def get_comments(recipe=None):
    url = recipe['source_url']
    title = recipe['title']
    items = microdata.get_items(urllib.urlopen(url))
    
    for item in items:
        if len(item.get_all('ingredients')) > 0:
            try:
                ingredients = [' '.join(i.replace('\n', '').split()) for i in item.get_all('ingredients')]
            except TypeError:
                return None
            for ing in ingredients:
                found = False
                ings = ing.split(' ')
                for i, word in enumerate(ings):
                    for m in measurements:
                        if word.startswith(m):
                            ings = ings[i+1:]
                            found = bool(ings)
                if found:
                    final = ' '.join(ings)
                    mesg = u"{}: {} {} with {}. {} ".format(title,
                                                              random.sample(starts, 1)[0],
                                                              final,
                                                              random.sample(foods, 1)[0],
                                                              random.sample(comments, 1)[0],
                    )
                    if len(mesg) > (140 - 23 - 23):
                        # This message is too long
                        print "Message was too long; retrying"
                        return None
                    else:
                        return mesg + url
Example #11
    def test_parse_nested(self):

        # parse the html for microdata
        items = get_items(open("test-data/example-nested.html"))

        # this html should have just one main item
        self.assertEqual(len(items), 1)

        item = items[0]

        # item's type should be set
        self.assertEqual(item.itemtype, [URI("http://schema.org/Event")])

        # test case of a nested itemprop
        self.assertEqual(item.name.strip(), "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")

        # test case of a nested itemscope
        self.assertTrue(isinstance(item.location, Item))
        self.assertEqual(item.location.itemtype, [URI("http://schema.org/Place")])
        self.assertEqual(item.location.url, URI("wells-fargo-center.html"))

        # address should be a nested item
        self.assertTrue(isinstance(item.location.address, Item))
        self.assertEqual(item.location.address.itemtype, [URI("http://schema.org/PostalAddress")])
        self.assertEqual(item.location.address.addressLocality, "Philadelphia")

        # json
        i = json.loads(item.json())
        self.assertEqual(i["properties"]["name"][0].strip(), "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")
        self.assertEqual(i["type"], ["http://schema.org/Event"])
        self.assertEqual(i["properties"]["url"], ["nba-miami-philidelphia-game3.html"])
        self.assertTrue(isinstance(i["properties"]["location"][0], dict))
        self.assertEqual(i["properties"]["location"][0]["properties"]["url"][0], "wells-fargo-center.html")
        self.assertTrue(isinstance(i["properties"]["location"][0]["properties"]["address"][0], dict))
        self.assertEqual(i["properties"]["location"][0]["properties"]["address"][0]["properties"]["addressLocality"][0], "Philadelphia")
Example #12
    def parse(self, source, sink, **kwargs):
        """
        Pass in a file or file-like object containing html5 microdata
        and populate the sink graph with triples.
        """
        for item in microdata.get_items(source.getByteStream()):
            self._add_item(item, sink)
Example #13
    def find_rating(self, title):
        tt_uri = self.IMDB_TITLE_URI % title
        try:
            page = microdata.get_items(urlopen(tt_uri))
            return page[0].aggregateRating.ratingValue
        except (AttributeError, IndexError) as e:
            self.l.debug("Parsed microdata content: " + str(page))
            self.l.error("Error parsing IMDB microdata: " + str(e))
Example #14
def get(htmldoc):
    """Get page data."""
    data = {}

    items = microdata.get_items(htmldoc)
    movie_item = items[0]
    data['microdata'] = movie_item

    return data
Example #15
def get_comments_from_article(guardian_article_url):
    read_url = urllib.urlopen(guardian_article_url)
    microdata_entities = microdata.get_items(read_url)
    entities = [json.loads(entity.json()) for entity in microdata_entities]
    comments = [
        entity['properties'] for entity in entities
        if 'http://schema.org/Comment' in entity['type']
    ]
    return comments
Example #16
def from_microdata(content):
    result = []
    for item in microdata.get_items(content):
        if item.itemtype == [microdata.URI('http://schema.org/JobPosting')]:
            job_posting = JobPosting()
            job_posting.title = item.title
            job_posting._original_format = 'microdata'
            result.append(job_posting)
    return result
Example #17
def from_microdata(content):
    result = []
    for item in microdata.get_items(content):
        if item.itemtype == [microdata.URI('http://schema.org/JobPosting')]:
            job_posting = JobPosting()
            job_posting.title = item.title
            job_posting._original_format = 'microdata'
            result.append(job_posting)
    return result
Example #18
    def download(self, url):
        self.url = url
        items = microdata.get_items(urllib.request.urlopen(self.url))
        item = items[0]
        self.set_text(item.articleBody)
        self.set_title(item.alternativeHeadline)
        self.set_thumbnailUrl(item.thumbnailUrl)
        #self.set_summary(item.articleBody)
        self.json = item.json()
Example #19
def microdata_extract(url_or_resource, itemprop):
    """
    Extracts via microdata scraping
    Make sure to add two scrapers/packages in case one of them fails
    Use try-except for scarping
    """

    resource = (url_or_resource if isinstance(url_or_resource, Resource)
                else Resource(url_or_resource))

    try:
        # Calling get_contents to get the html contents
        html_content = resource.get_contents_as_file()
        items = microdata_library.get_items(html_content)
    except ValueError:
        return None

    try:
        # Getting json data from the html content
        item = items[0]

        item_data = item.json()
        json_item_data = json.loads(item_data)
        try:
            itemprop = itemprop.split('/')
        except AttributeError:
            return None

        # Removing empty lists
        itemprop = [key for key in itemprop if key]

        itemprop_list = []
        for tag in itemprop:
            # numeric path segments become list indexes; the rest stay dict keys
            try:
                itemprop_list.append(int(tag))
            except ValueError:
                itemprop_list.append(tag)

        try:

            def f(iterable, key):
                return iterable[key]

            # Reduce the json data to the required value
            return reduce(f, itemprop_list, json_item_data)
        except IndexError:
            return None
        except KeyError:
            return None

    # Exception in case there is no microdata in the html content
    except IndexError:
        return None
Example #20
def scrape_recipe(recipe_url):
    if recipe_exists(recipe_url):
        #print "Not Scraping " + recipe_url + ", already exists"
        return True
    else:
        #print "Scraping recipe microdata: " + recipe_url
        items = microdata.get_items(urllib.urlopen(recipe_url))
        for item in items:
            print "Scraping: " + item.name + " from " + recipe_url
            recipe_model = { "url" : recipe_url, "name" : item.name, "recipe" : item.json() }
            scraperwiki.sqlite.save(unique_keys=["url"], table_name="recipes", data=recipe_model)
        return True
Example #21
def parse_content_microdata(parse_url):
    # https://developers.google.com/structured-data/testing-tool/
    # not working with http://www.bonprix.de/produkt/maxi-jerseykleid-dunkelblau-bedruckt-958483/
    # which is valid microdata according to Google

    print parse_url

    items = microdata.get_items(urllib.urlopen(parse_url))
    data = [i.json_dict() for i in items]

    pp.pprint(data)
    return data
Example #22
    def scrapSoftwareApplicationSchema(self, html):
        # extract microdata from html
        items = microdata.get_items(html)
        softwareApps = []
        for item in items:
            # from all itemscope elements, filter by SoftwareApplication
            for itemtype in item.itemtype:
                if itemtype.string == SoftwareApp.ENTITY_TYPE:
                    props = self._getEntityPropertyValuesFromMicroItem(item)
                    softwareApp = SoftwareApp(props)
                    softwareApps.append(softwareApp)
                    break
        return softwareApps
Example #23
    def load(self):
        """Retrieves the data for this object from the WikiTree server.
        This happens automatically when any of the properties are accessed.

        >>> p = Person('Sloan-518')
        >>> p.load()
        """
        items = microdata.get_items(urllib.request.urlopen(self.url))
        data = items[0].json_dict()['properties']

        self.__dict__ = self.__process_microdata__(None, data)
        self.__data__ = data
        self.__loaded__ = True
Example #24
def get_data(url):
    """ Uses the metadata module to parse the metadata from the provided URL """
    try:
        request = requests.get(url)
    except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
        raise ParseError(e)

    items = microdata.get_items(request.text)

    for item in items:
        if item.itemtype == [microdata.URI("http://schema.org/Recipe")]:
            return item

    raise ParseError("No recipe data found")
Example #25
    def test_parse_nested(self):

        # parse the html for microdata
        with open("test-data/example-nested.html") as f:
            items = get_items(f)

        # this html should have just one main item
        self.assertEqual(len(items), 1)

        item = items[0]

        # item's type should be set
        self.assertEqual(item.itemtype, [URI("http://schema.org/Event")])

        # test case of a nested itemprop
        self.assertEqual(
            item.name.strip(),
            "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")

        # test case of a nested itemscope
        self.assertTrue(isinstance(item.location, Item))
        self.assertEqual(item.location.itemtype,
                         [URI("http://schema.org/Place")])
        self.assertEqual(item.location.url, URI("wells-fargo-center.html"))

        # address should be a nested item
        self.assertTrue(isinstance(item.location.address, Item))
        self.assertEqual(item.location.address.itemtype,
                         [URI("http://schema.org/PostalAddress")])
        self.assertEqual(item.location.address.addressLocality, "Philadelphia")

        # json
        i = json.loads(item.json())
        self.assertEqual(
            i["properties"]["name"][0].strip(),
            "Miami Heat at Philadelphia 76ers - Game 3 (Home Game 1)")
        self.assertEqual(i["type"], ["http://schema.org/Event"])
        self.assertEqual(i["properties"]["url"],
                         ["nba-miami-philidelphia-game3.html"])
        self.assertTrue(isinstance(i["properties"]["location"][0], dict))
        self.assertEqual(
            i["properties"]["location"][0]["properties"]["url"][0],
            "wells-fargo-center.html")
        self.assertTrue(
            isinstance(
                i["properties"]["location"][0]["properties"]["address"][0],
                dict))
        self.assertEqual(
            i["properties"]["location"][0]["properties"]["address"][0]
            ["properties"]["addressLocality"][0], "Philadelphia")
Example #26
def get_article_body(nytimes_article_url):
    try:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(nytimes_article_url)
        html = response.read()
        microdata_entities = microdata.get_items(html)
        entities = [json.loads(entity.json()) for entity in microdata_entities]
        body = []
        for entity in entities:
            body += entity[u'properties'][u'articleBody']
    except Exception:
        return []

    return body
Example #27
def get_article_body(guardian_article_url):
    try:
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(guardian_article_url)
        html = response.read()
        microdata_entities = microdata.get_items(html)
        entities = [json.loads(entity.json()) for entity in microdata_entities]
        body = []
        for entity in entities:
            if entity[u'type'] == [u'http://schema.org/NewsArticle']:
                return entity[u'properties']
    except Exception:
        return []

    return body
Example #28
def get_data(url):
    """ Uses the metadata module to parse the metadata from the provided URL """
    try:
        request = requests.get(url)
        request.raise_for_status()
    except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
        raise ParseError(e)

    items = microdata.get_items(request.text)

    for item in items:
        if item.itemtype == [microdata.URI("http://schema.org/Recipe")]:
            return item

    raise ParseError("No recipe data found")
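Example #29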
def parseHtml(url):
  '''
    Parses the structured data present in the HTML
    using the microdata library: https://github.com/edsu/microdata
    and the libs
          html5lib: https://github.com/html5lib/html5lib-python
          lxml: http://lxml.de/index.html
  '''
  location = urllib.urlopen(url)
  html = lhtml.fromstring(location.read())
  softwareHtmlString = html.get_element_by_id("software")
  items = microdata.get_items(lhtml.tostring(softwareHtmlString))
  if len(items):
    return items[0]
  else:
    return None
Example #30
def microdata_filter(site_id):
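    # returns a (matched, price, currency, timestamp) tuple; the last three
    # are None whenever no schema.org/Product offer with a price is found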
    products = []
    schema_product_type = 'http://schema.org/Product'

    data_file_path = config.URL_CRAWLED_DATA_DIR + str(site_id)
    if not os.path.exists(data_file_path):
        return False, None, None, None

    with open(data_file_path, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']
        f.seek(0)  # rewind: chardet's read() consumed the whole file
        items = microdata.get_items(f, encoding)
    if not items:
        return False, None, None, None

    for item in items:
        item = json.loads(item.json())
        if item.get('type')[0] == schema_product_type and item.get(
                'properties').get('offers'):
            product_price = None
            product_currency = None
            try:
                product_price = item.get('properties').get('offers')[0].get(
                    'properties').get('price')[0]
            except Exception as e:
                print(e)
            try:
                product_currency = item.get('properties').get('offers')[0].get(
                    'properties').get('priceCurrency')[0]
            except Exception as e:
                print(e)

            if product_price:
                products.append({
                    'price': price_formatter(product_price)[0],
                    'currency': product_currency,
                })

    if len(products) == 0:
        return False, None, None, None
    else:
        product = products[0]
        return True, product.get('price'), product.get(
            'currency'), datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
Example #31
    def test_parse(self):

        # parse the html for microdata
        with open('test-data/example.html') as f:
            items = get_items(f)

        # this html should have just one main item
        self.assertEqual(len(items), 1)

        item = items[0]

        # item's type should be set
        self.assertEqual(item.itemtype, [URI("http://schema.org/Person")])

        # test simple case of a single valued property
        self.assertEqual(item.name, "Jane Doe")

        # but object properties can have multiple values ...

        # basic accessor returns the first value
        self.assertEqual(item.colleagues,
                         URI("http://www.xyz.edu/students/alicejones.html"))

        # and get_all, well, gets them all of course :)
        self.assertEqual(item.get_all("colleagues"), [
            URI("http://www.xyz.edu/students/alicejones.html"),
            URI("http://www.xyz.edu/students/bobsmith.html")
        ])

        # address should be another item
        self.assertTrue(isinstance(item.address, Item))
        self.assertEqual(item.address.itemtype,
                         [URI("http://schema.org/PostalAddress")])
        self.assertEqual(item.address.addressLocality, "Seattle")

        # <script> tag should be ignored in the content text
        self.assertFalse("Unrelated text" in item.address.streetAddress)

        # json
        i = json.loads(item.json())
        self.assertEqual(i["properties"]["name"][0], "Jane Doe")
        self.assertEqual(i["type"], ["http://schema.org/Person"])
        self.assertEqual(i["id"], "http://www.xyz.edu/~jane")
        self.assertTrue(isinstance(i["properties"]["address"][0], dict))
        self.assertEqual(
            i["properties"]["address"][0]["properties"]["addressLocality"][0],
            "Seattle")
Example #32
    def test_parse_unlinked(self):
        items = get_items(open("test-data/unlinked.html"))
        self.assertEqual(len(items), 2)

        i = items[0]
        self.assertEqual(i.itemtype, [URI("http://schema.org/Person")])
        self.assertEqual(i.name, "Jane Doe")
        self.assertEqual(i.streetAddress, None)

        # this PostalAddress is enclosed within the Person but it is
        # not linked via the streetAddress itemprop. This particular example
        # would represent a bug in the markup, but technically items can appear
        # within other items without them being related together with an
        # itemprop.

        i = items[1]
        self.assertEqual(i.itemtype, [URI("http://schema.org/PostalAddress")])
        self.assertTrue("Whitworth" in i.streetAddress)
Example #33
    def test_parse_unlinked(self):
        items = get_items(open("test-data/unlinked.html"))
        self.assertEqual(len(items), 2)

        i = items[0]
        self.assertEqual(i.itemtype, [URI("http://schema.org/Person")])
        self.assertEqual(i.name, "Jane Doe")
        self.assertEqual(i.streetAddress, None)

        # this PostalAddress is enclosed within the Person but it is
        # not linked via the streetAddress itemprop. This particular example
        # would represent a bug in the markup, but technically items can appear
        # within other items without them being related together with an
        # itemprop.

        i = items[1]
        self.assertEqual(i.itemtype, [URI("http://schema.org/PostalAddress")])
        self.assertTrue('Whitworth' in i.streetAddress)
Example #34
def scrape_recipe(recipe_url):
    if recipe_exists(recipe_url):
        #print "Not Scraping " + recipe_url + ", already exists"
        return True
    else:
        #print "Scraping recipe microdata: " + recipe_url
        items = microdata.get_items(urllib.urlopen(recipe_url))
        for item in items:
            print "Scraping: " + item.name + " from " + recipe_url
            recipe_model = {
                "url": recipe_url,
                "name": item.name,
                "recipe": item.json()
            }
            scraperwiki.sqlite.save(unique_keys=["url"],
                                    table_name="recipes",
                                    data=recipe_model)
        return True
Example #35
    def get_recipe_from_file(self, file):
        html_text = file.getvalue().decode("utf-8")

        items = microdata.get_items(html_text)
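        # look for a schema.org Recipe item among the extracted microdata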
        for i in items:
            md_json = json.loads(i.json())
            if 'schema.org/Recipe' in str(md_json['type']):
                recipe_json = find_recipe_json(md_json['properties'], '')
                recipe = Recipe.objects.create(
                    name=recipe_json['name'].strip(),
                    created_by=self.request.user,
                    internal=True)
                step = Step.objects.create(
                    instruction=recipe_json['recipeInstructions'])

                for ingredient in recipe_json['recipeIngredient']:
                    f, created = Food.objects.get_or_create(
                        name=ingredient['ingredient']['text'])
                    u, created = Unit.objects.get_or_create(
                        name=ingredient['unit']['text'])
                    step.ingredients.add(
                        Ingredient.objects.create(food=f,
                                                  unit=u,
                                                  amount=ingredient['amount'],
                                                  note=ingredient['note']))

                recipe.steps.add(step)

                soup = BeautifulSoup(html_text, "html.parser")
                image = soup.find('img')
                image_name = image.attrs['src'].strip().replace('Images/', '')

                for f in self.files:
                    if '.zip' in f.name:
                        import_zip = ZipFile(f.file)
                        for z in import_zip.filelist:
                            if re.match(f'^Recipes/Images/{image_name}$',
                                        z.filename):
                                self.import_recipe_image(
                                    recipe,
                                    BytesIO(import_zip.read(z.filename)))

                return recipe
Example #36
    def get_microdata_items(self, response):
        items = microdata.get_items(response.text)

        result = []

        for item in items:
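            # itemtype[-1] is the last (most specific) declared type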
            if 'ClaimReview' in str(item.itemtype[-1]):
                rr = item.get('reviewRating')
                img = item.get('image')
                ir = item.get('itemReviewed')
                url = str(item.get('url'))

                result.append(
                    dict(
                        type=str(item.itemtype[-1]),
                        datePublished=self.parse_date(
                            item.get('datePublished')),
                        dateModified=self.parse_date(item.get('dateModified')),
                        url=url,
                        author=self.microdata_authors_from(item),
                        image=dict(type=str(img.itemtype[-1]),
                                   url=str(img.get('url')),
                                   width=img.get('width'),
                                   height=img.get('height')) if img else None,
                        claimReviewed=item.get('claimReviewed'),
                        reviewRating=dict(type=str(rr.itemtype[-1]),
                                          ratingValue=rr.get('ratingValue'),
                                          bestRating=rr.get('bestRating'),
                                          worstRating=rr.get('worstRating'),
                                          alternateName=rr.get('alternateName')
                                          or rr.get('name')),
                        itemReviewed=dict(
                            type=str(ir.itemtype[-1]),
                            author=self.microdata_authors_from(ir),
                            datePublished=self.parse_date(
                                ir.get('datePublished')),
                            sameAs=[str(s) for s in ir.get_all('sameAs')]),
                        keywords=str(item.get('keywords'))
                        if item.get('keywords') else None,
                    ))

        return result
Example #37
    def test_parse(self):

        # parse the html for microdata
        items = get_items(open("test-data/example.html"))

        # this html should have just one main item
        self.assertEqual(len(items), 1)

        item = items[0]

        # item's type should be set
        self.assertEqual(item.itemtype, [URI("http://schema.org/Person")])

        # test simple case of a single valued property
        self.assertEqual(item.name, "Jane Doe")

        # but object properties can have multiple values ...

        # basic accessor returns the first value
        self.assertEqual(item.colleagues,
                URI("http://www.xyz.edu/students/alicejones.html"))

        # and get_all, well, gets them all of course :)
        self.assertEqual(item.get_all("colleagues"),
                [URI("http://www.xyz.edu/students/alicejones.html"),
                 URI("http://www.xyz.edu/students/bobsmith.html")])

        # address should be another item
        self.assertTrue(isinstance(item.address, Item))
        self.assertEqual(item.address.itemtype, [URI("http://schema.org/PostalAddress")])
        self.assertEqual(item.address.addressLocality, "Seattle")

        # <script> tag should be ignored in the content text
        self.assertFalse("Unrelated text" in item.address.streetAddress)

        # json
        i = json.loads(item.json())
        self.assertEqual(i["properties"]["name"][0], "Jane Doe")
        self.assertEqual(i["type"], ["http://schema.org/Person"])
        self.assertEqual(i["id"], "http://www.xyz.edu/~jane")
        self.assertTrue(isinstance(i["properties"]["address"][0], dict))
        self.assertEqual(i["properties"]["address"][0]["properties"]["addressLocality"][0], "Seattle")
Example #38
def scrape(url):
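    # returns (set of same-domain links found on the page, recipe dict or None)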
    try:
        parsed_uri = urlparse(url)
        response = urlopen(url)
        html = response.read()
        items = microdata.get_items(html)
        recipe = None
        if len(items) > 0:
            for item in items:
                if str(item.itemtype[0]).endswith("/Recipe"):
                    instructions = []
                    ingredients = []
                    if item.recipeInstructions is not None:
                        for instruction in item.get_all("recipeInstructions"):
                            splitted = instruction.split("\n")
                            for line in splitted:
                                clean = line.strip()
                                if clean != "":
                                    instructions.append(clean)
                    if item.ingredients is not None:
                        ingredients = item.get_all("ingredients")
                    if item.recipeIngredient is not None:
                        ingredients = item.get_all("recipeIngredient")
                    if len(ingredients) > 0 and len(instructions) > 0:
                        recipe = {
                            "name": item.name,
                            "ingredients": ingredients,
                            "instructions": instructions
                        }
        soup = BeautifulSoup(markup=html, features="html5lib")
        hrefs = []
        for link in soup.findAll('a'):
            href = link.get('href')
            parsed_sub_uri = urlparse(href)
            if parsed_sub_uri.netloc == "" or parsed_sub_uri.netloc == parsed_uri.netloc:
                new_url = parsed_uri.scheme + "://" + parsed_uri.netloc + parsed_sub_uri.path
                if new_url != url:
                    hrefs.append(new_url)
        return set(hrefs), recipe
    except Exception as e:
        print(e)
        return set([]), None
Example #39
    def get(self):
        url = self.request.get('url')
        if not url:
            self.redirect('/')
            return

        extracted = {}
        extracted['items'] = items = []

        url_contents = urllib.urlopen(url).read()

        for item in microdata.get_items(url_contents):
            items.append(item.json_dict())

        context = {
            "url": url,
            "request_url": self.request.url,
            "extracted": json.dumps(extracted, indent=4),
            "items": items,
            "access_date": datetime.date.today(),
            "show_wikipedia": self.request.get('wikipedia', 'off') == 'on'
        }

        url_parts = urlsplit(url)
        site_name = url_parts.netloc
        if site_name.endswith('wdl.org'):
            site_name = 'WDL'
            wiki_site_name = '[[World Digital Library]]'
        else:
            wiki_site_name = site_name

        context['site_name'] = site_name
        context['wiki_site_name'] = wiki_site_name

        best_match = self.request.accept.best_match(['application/json', 'text/html'])
        if best_match == 'application/json':
            self.response.content_type = 'application/json'
            self.response.write(context['extracted'])
        else:
            template = JINJA_ENVIRONMENT.get_template('index.html')
            self.response.write(template.render(context))
Example #40
def parse_recipes(response, data={}):
  recipes = []

  items = microdata.get_items(response.body)


  for item in items:
    #log.msg(item.json(), level=log.DEBUG)

    recipe = {}

    if item.itemtype == [URI("http://data-vocabulary.org/Recipe")]:
      recipe = handle_data_vocab(item, data)
    elif item.itemtype == [URI("http://schema.org/Recipe")]:
      recipe = handle_schema_org(item, data)
    else:
      log.msg('could not determine microdata type', level=log.ERROR)

    if 'image' in recipe:
      img = recipe['image']
      fb_image = extract_facebook_images(response)

      # precedence fix: the startswith() check must apply to both str and unicode
      if isinstance(img, (str, unicode)) and img.startswith('//'):
        recipe['image'] = extract_facebook_images(response)
      elif type(img) is URI and img.string.startswith('//'):
        recipe['image'] = extract_facebook_images(response)

      # favor facebook image
      if img != fb_image and fb_image is not None:
        recipe['image'] = fb_image

      if recipe['image'] is not None and recipe['image'].startswith('//'):
        recipe['image'] = 'http:' + recipe['image']

    recipe['source'] = data['source']
    recipes.append(recipe)

  return recipes
Example #41
def parse_page(page_dump):
    '''
    Parse page
    '''

    parsed_microdata = microdata.get_items(page_dump)
    if parsed_microdata:
        items = [i for i in parsed_microdata if 'name' in i.props]

        if items:
            item = items[0]
        else:
            print "No items with property 'name'"
            return (None, '')
    else:
        print "Empty microdata"
        return (None, '')
    try:
        soup = BeautifulSoup(page_dump)
        price = extract_price(soup)
    except Exception:
        logging.debug("Can't extract price for %s", item.name.strip())
        price = None
    return (item, price)
Example #42
    def consumeData(self, data):
        """
        Parse the microdata into structured data
        """
        ret = []

        soup = BeautifulSoup(StringIO(data))
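        # pre-split combined ingredient/instruction blocks so each entry is parsed as its own value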
        ingredientses = soup.find_all(None, itemprop='ingredients')
        for ing in ingredientses:
            separateByClass(soup, ing, "ingredient")
            separateByTag(soup, ing, ['br', 'tr', 'li'])
        instructionses = soup.find_all(None, itemprop="recipeInstructions")
        for ins in instructionses:
            separateByClass(soup, ins, "instruction")
            separateByTag(soup, ins, ['br', 'tr', 'li'])
        workingDocument = StringIO(soup.encode('utf-8'))

        items = microdata.get_items(workingDocument)
        for i in items:
            for typ in i.itemtype:
                if typ.string == MICROFORMAT_RECIPE:
                    ret.append(i.json())
                    break
        return map(json.loads, ret)
Example #43
def get_all_program_urls():
    all_program_urls = []

    page = 1
    while True:
        url = f"https://www.rtp.pt/play/bg_l_pg/?listtype=az&page={page}&type=all"

        logging.info(f"Fetching {url}")
        r = requests.get(url)
        r.raise_for_status()

        items = microdata.get_items(r.text)
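        # an empty page means we have paged past the last listing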
        if len(items) == 0:
            break

        for item in items:
            assert item.itemtype[0] == microdata.URI(
                "http://schema.org/VideoObject")
            program_url = urljoin("https://www.rtp.pt/", item.url.string)
            all_program_urls.append(program_url)

        page += 1

    return all_program_urls
Example #44
    def bookmarklet(self, request): 
        """
        Fetches the recipe for the url, saves the recipe, and returns a response to the chrome extension 
        """
        def returnResponse(status, recipes, message): 
            """
            Return the appropriate data structure to the http response 
            """
            data = {'status': status, 
                    'recipes': recipes, 
                    'message': message} 
            defer.returnValue(json.dumps(data)) 

        userEmail = self.user(request).email
        if not userEmail: 
            returnResponse(status="error", recipes=[], message=ResponseMsg.not_logged_in)

        url = request.args['uri'][0]
        pageSource = yield treq.get(url).addCallback(treq.content)
        
        items = microdata.get_items(pageSource)
        recipeSaved = []

        for i in items: 
            itemTypeArray = [x.string for x in i.itemtype] 
            if RECIPE_SCHEMA in itemTypeArray: 
                recipe = i
                saveItem = Recipe.fromMicrodata(recipe, userEmail)
                Recipe.saveOnlyOnce(saveItem)
                recipeSaved.append({"name": saveItem.name, "urlKey": saveItem.urlKey}) 
                break 
        
        if len(recipeSaved) == 0:
            returnResponse(status="error", recipes=[], message=ResponseMsg.no_recipe) 

        returnResponse(status="ok", recipes=recipeSaved, message=ResponseMsg.blank)
Example #45
    def consumeData(self, data):
        """
        Parse the microdata into structured data
        """
        ret = []

        soup = BeautifulSoup(StringIO(data))
        ingredientses = soup.find_all(None, itemprop='ingredients')
        for ing in ingredientses:
            separateByClass(soup, ing, "ingredient")
            separateByTag(soup, ing, ['br', 'tr', 'li'])
        instructionses = soup.find_all(None, itemprop="recipeInstructions")
        for ins in instructionses:
            separateByClass(soup, ins, "instruction")
            separateByTag(soup, ins, ['br', 'tr', 'li'])
        workingDocument = StringIO(soup.encode('utf-8'))

        items = microdata.get_items(workingDocument)
        for i in items:
            for typ in i.itemtype:
                if typ.string == MICROFORMAT_RECIPE:
                    ret.append(i.json())
                    break
        return map(json.loads, ret)
Example #46
    def items_from_str(self, html_str):
        self.items = []
        self.items = microdata.get_items(html_str)
        self.inspect_items()
Example #47
	def items_from_str(self, html_str):
		self.items = []
		self.items = microdata.get_items(html_str)
		self.inspect_items()
Example #48
def get_microdata_author(author, instance):

    try:
        author_url = author.props['url'][0].string

    except Exception:
        LOGGER.error(u'Microdata author has no URL?!?')
        return None

    if author_url.startswith(u'/'):
        # make a full absolute URL, and let things flow.
        try:
            proto, host_and_port, remaining = split_url(instance.url)

        except Exception:
            LOGGER.error(u'schema.org-extractor: could not split “%s” '
                         u'to get schema/host parts, author_url “%s” '
                         u'could be unusable.',
                         instance.url, author_url)

        else:
            author_url = '{0}://{1}{2}'.format(proto, host_and_port, author_url)

    if not author_url.startswith(u'http'):
        # We already have a full name.
        return author_url

    response = requests.get(author_url)
    response.encoding = detect_encoding_from_requests_response(response)

    try:
        items = microdata.get_items(response.text.encode('utf-8'))

    except Exception:
        LOGGER.warning(u'schema.org-extractor: could not extract author '
                       u'microdata from %s', author_url)
        return author_url

    author = {}

    for item in items:
        schema_properties = item.props

        email = schema_properties.get('email', None)

        if email is not None:
            # microdata items are always lists…
            author['email'] = email[0]

        name = schema_properties.get('name', None)

        if name is not None:
            # microdata items are always lists…
            author['name'] = name[0]

        if item.type == 'http://schema.org/Person':
            family_name = schema_properties.get('familyName', None)
            given_name = schema_properties.get('givenName', None)

            if given_name is not None and family_name is not None:
                # intended overwrite
                author['name'] = u'{0} {1}'.format(given_name, family_name)

        # implicit: elif item.type == 'http://schema.org/Organization':
        # But we already have all the needed data.

    if bool(author):
        return author

    LOGGER.warning(u'schema.org-extractor: no Person/Organization found '
                   u'in author page %s.', author_url)
    return author_url
Example #49
output_scr = open("/tmp/get_digikey_data.scr", "w+")

if len(sys.argv) != 2:
    print "error: invalid number of inputs"
    print "usage: python get_digikey_data.py [url]"
    sys.exit(1)

url = sys.argv[1]
headers = { 'User-Agent' : 'Mozilla/5.0' }
postdata = None
#if not url.endswith(".html"):
#    url += ".html"
try:
    req = urllib2.Request(url, postdata, headers)
    data = urllib2.urlopen(req).read()
    items = microdata.get_items(data)
except Exception:
    print "error: invalid url or unable to connect"
    sys.exit(1)

d = next(item for item in items if item.itemtype[0] == microdata.URI("http://schema.org/WebPage")).json_dict()
#d = item.json_dict()
DIST_NAME = "Digi-Key"
DIST_PN = d['properties']['mainEntity'][0]['properties']['productID'][0][4:].encode('ascii','ignore').strip()
MFG_NAME = d['properties']['mainEntity'][0]['properties']['manufacturer'][0].encode('ascii','ignore').strip()
MFG_PN = d['properties']['mainEntity'][0]['properties']['model'][0].encode('ascii','ignore').strip()
DESC = d['properties']['mainEntity'][0]['properties']['description'][0].encode('ascii','ignore').strip().replace('\n', '<br>')
print "DIST_NAME: " + DIST_NAME
print "DIST_PN: " + DIST_PN
print "MFG_NAME: " + MFG_NAME
print "MFG_PN: " + MFG_PN
	elif re.search(r"^\d+$", v):
		return int(v)
	else:
		return v

defaults = {}
if args.add is not None:
	for p in args.add:
		n, v = p.split("=", 1)
		defaults[n] = parse_value(v)
# print (defaults)
# sys.exit()

if args.output == "-":
	out = sys.stdout
else:
	out = open(args.output, "w")


data = {}
data['items'] = items = []
for i in args.input:
	with open(i) as f:
		for item in microdata.get_items(f):
			for n, v in defaults.items():
				item.set(n, v)
			items.append(item.json_dict())

print(json.dumps(data, indent=2), file=out)

Example #51
        chunks.append(_text(child))
    return ''.join(chunks)

if __name__ == "__main__":
    import urllib
    if len(sys.argv) < 2:
        print "Usage: %s URL [...]" % sys.argv[0]
        sys.exit(1)

    for url in sys.argv[1:]:
        sys.stderr.write(url + "\n")

        microdata = {}
        microdata['items'] = items = []

        for item in get_items(urllib.urlopen(url)):
            items.append(item.json_dict())

        print json.dumps(microdata, indent=2)

Example #52
def process(self, instance, parameters=None,
            verbose=True, commit=True, **kwargs):
    """ See source code. """

    CONTENT_TYPES = models.CONTENT_TYPES

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    # Only used in accepts() code.
    # repair = parameters.get('repair', False)

    if instance.content_type == CONTENT_TYPES.HTML:
        html_to_work_on = instance.content

    else:
        # The existence of this has already been tested in accepts().
        # We cannot run process() if the instance is not HTML or not
        # repairing it with a known HTML history version.
        html_to_work_on = instance.history.filter(
            content_type=CONTENT_TYPES.HTML).earliest('history_date').content

    try:
        # The microdata parser expects a UTF-8 encoded string… too bad.
        items = microdata.get_items(html_to_work_on.encode('utf-8'))

    except Exception:
        LOGGER.warning(u'schema.org-extractor: could not extract microdata '
                       u'from %s %s', instance_name, instance_id)
        return

    need_save = False

    # ————————————————————————————————————————————————————————————————— Extract

    attributes = OrderedDict()

    for item in items:
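        # item.props maps each itemprop name to a list of raw values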
        schema_properties = item.props

        # LOGGER.info(u'item %s', item.json())

        # Common attributes to all types we handle in 1flow.

        name = schema_properties.get('name', None)

        # Do not overwrite with a less specific value if
        # name was already set via 'Article::headline'.
        if name is not None and attributes.get('name', None) is None:
            attributes['name'] = get_property(name)

        date_published = schema_properties.get('datePublished', None)

        if date_published is not None:
            attributes['date_published'] = get_property(date_published)

        excerpt = schema_properties.get('description', None)

        if excerpt is not None:
            attributes['excerpt'] = get_property(excerpt)

        tags = schema_properties.get('keywords', None)

        if tags is not None:
            attributes['tags'] = extract_tags(tags)

        image_url = schema_properties.get('thumbnailUrl', None)

        if image_url is not None:
            attributes['image_url'] = get_property(image_url)

        authors = schema_properties.get('author', None)

        # Author can be a link to the author page, which
        # will give us a Person or Organization schema.
        if authors is not None:
            found_authors = get_microdata_authors(authors, instance)
            if found_authors:
                attributes['authors'] = found_authors

        genre = schema_properties.get('genre', None)

        if genre is not None:
            if 'tags' not in attributes:
                attributes['tags'] = []
            for one_genre in genre:
                attributes['tags'].extend(extract_tags(one_genre))

        if item.type == 'http://schema.org/VideoObject':

            if instance.content_type != CONTENT_TYPES.VIDEO:
                instance.content_type = CONTENT_TYPES.VIDEO
                need_save = True

                LOGGER.info(u'schema.org-extractor: Set %s %s content type '
                            u'to VIDEO.', instance_name, instance_id)

        elif item.type in (
            'http://schema.org/Article',
            'http://schema.org/NewsArticle',
            'http://schema.org/TechArticle',

            'http://schema.org/BlogPosting',
            'http://schema.org/WebPage',
            'http://schema.org/CreativeWork',
        ):

            # HeadLine overwrites name, it's more specific.
            attributes['name'] = get_property(
                schema_properties.get('headline', None))
            attributes['language'] = get_property(
                schema_properties.get('inLanguage', None))
            attributes['word_count'] = get_property(
                schema_properties.get('wordCount', None))

            creators = schema_properties.get('creator', None)

            # Author can be a link to the creator page, which
            # will give us a Person or Organization schema.
            if creators is not None:
                creators = get_microdata_authors(creators, instance)

                if creators:
                    if 'authors' in attributes:
                        attributes['authors'].extend(creators)

                    else:
                        attributes['authors'] = creators

            # TODO:
            # citation
            # comment
            # articleBody → content
            # articleSection → Tags
            #
            # News:
            # dateline → ?
            #
            # Tech:
            # dependencies
            # proficiencyLevel
            #
            # WebPage:
            # specialty → ?
            # significantLink → crawl ?
            # reviewedBy → ?
            # lastReviewed → ?
            # relatedLink → ?
            # primaryImageOfPage

    # —————————————————————————————————————————————————————— Transform & assign
    # turn attributes into their python / 1flow native-internals formats.

    if attributes.get('date_published', None) is not None:
        try:
            attributes['date_published'] = datetime(*datetime_extended_parser(
                attributes['date_published'])[:6])

        except Exception:
            LOGGER.exception(u'schema.org-extractor: unparseable date “%s”',
                             attributes['date_published'])

            # Be sure we don't try to use it below.
            attributes['date_published'] = None

    if attributes.get('language', None) is not None:
        try:
            attributes['language'] = models.Language.get_by_code(
                attributes['language'])

        except Exception:
            LOGGER.exception(u'schema.org-extractor: unable to get '
                             u'language “%s”', attributes['language'])

            # Be sure we don't try to use it below.
            attributes['language'] = None

    if attributes.get('word_count', None) is not None:
        attributes['word_count'] = int(attributes['word_count'])

    if attributes.get('tags', None) is not None:
        # We pop() tags to avoid trying to setattr() it below.
        tags = models.SimpleTag.get_tags_set(attributes.pop('tags'),
                                             origin=instance)
        instance.tags.add(*tags)

        if verbose:
            LOGGER.info(u'schema.org-extractor: added tags %s to %s %s.',
                        u', '.join(tag.name for tag in tags),
                        instance_name, instance_id)

    if attributes.get('authors', None) is not None:
        # We pop() tags to avoid trying to setattr() it below.
        authors = attributes.pop('authors')

        # LOGGER.info(authors)

        # This will implicitely add() the author to the instance.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors, origin_article=instance)

        # LOGGER.info(authors)

        LOGGER.info(u'schema.org-extractor: added author(s) %s to %s %s.',
                    u', '.join(unicode(a) for a in authors),
                    instance_name, instance_id)

    # if verbose:
    #     LOGGER.debug(u'schema.org-extractor: %s', attributes)

    for attribute, value in attributes.items():
        if value is None:
            continue

        if getattr(instance, attribute) is None:
            setattr(instance, attribute, value)

            need_save = True

            if verbose:
                LOGGER.info(u'schema.org-extractor: Set %s %s to %s %s.',
                            attribute, value, instance_name, instance_id)

    if need_save and commit:
        instance.save()
Example #53
	def items_from_URL(self, doc_url):
		self.items = []
		self.items = microdata.get_items(urllib2.urlopen(doc_url).read())
		self.inspect_items()
Example #54
    def test_skip_level(self):
        items = get_items(open("test-data/skip-level.html"))
        self.assertEqual(len(items), 1)
        self.assertEqual(items[0].name, "Jane Doe")