Example #1
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        if not "kind" in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_video_json = json_response["items"][0]

        dict_of_keylists = {
            'youtube:views': ['statistics', 'viewCount'],
            'youtube:likes': ['statistics', 'likeCount'],
            'youtube:dislikes': ['statistics', 'dislikeCount'],
            'youtube:favorites': ['statistics', 'favoriteCount'],
            'youtube:comments': ['statistics', 'commentCount'],
        }

        metrics_dict = provider._extract_from_data_dict(
            this_video_json, dict_of_keylists)

        metrics_dict = provider._metrics_dict_as_ints(metrics_dict)

        return metrics_dict
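Note: the YouTube extraction above amounts to walking key paths in parsed JSON and casting the counts to ints. A minimal standalone sketch of the same idea against an assumed, abbreviated YouTube-style payload (json.loads stands in for provider._load_json):

    import json

    # Assumed, abbreviated payload; real API responses carry many more fields.
    page = ('{"kind": "youtube#videoListResponse",'
            ' "items": [{"statistics": {"viewCount": "42", "likeCount": "7"}}]}')
    stats = json.loads(page)["items"][0]["statistics"]
    metrics = {"youtube:views": int(stats["viewCount"]),  # counts arrive as strings,
               "youtube:likes": int(stats["likeCount"])}  # hence the int() casts
    assert metrics == {"youtube:views": 42, "youtube:likes": 7}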
Example #2
    def biblio(self, aliases, provider_url_template=None, cache_enabled=True):

        tweet_url = self.get_best_id(aliases)
        biblio_embed_url = self.biblio_template_url % (
            self.tweet_id(tweet_url))
        response = self.http_get(biblio_embed_url)
        data = provider._load_json(response.text)

        biblio_dict = {}
        biblio_dict["repository"] = "Twitter"
        biblio_dict["url"] = tweet_url

        if not data:
            return biblio_dict

        biblio_dict["title"] = u"@{screen_name}".format(
            screen_name=self.screen_name(tweet_url))
        biblio_dict["authors"] = data["author_name"]
        biblio_dict["embed"] = data["html"]
        biblio_dict["embed_url"] = biblio_embed_url
        biblio_dict["account"] = u"@{screen_name}".format(
            screen_name=self.screen_name(tweet_url))
        try:
            tweet_match = re.findall(
                u'<p>(.*?)</p>.*statuses/\d+">(.*?)</a></blockquote>',
                biblio_dict["embed"])
            biblio_dict["tweet_text"] = tweet_match[0][0]
            biblio_dict["date"] = datetime.datetime.strptime(
                tweet_match[0][1], "%B %d, %Y").isoformat()
            biblio_dict["year"] = biblio_dict["date"][0:4]
        except (AttributeError, IndexError):  # IndexError: the regex found no match
            logger.debug("couldn't parse tweet embed {embed}".format(
                embed=biblio_dict["embed"]))

        return biblio_dict
Example #3
    def _extract_biblio(self, page, id=None):

        if not "snippet" in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_video_json = json_response["items"][0]

        dict_of_keylists = {
            'title': ['snippet', 'title'],
            'channel_title': ['snippet', 'channelTitle'],
            'published_date': ['snippet', 'publishedAt']
        }

        biblio_dict = provider._extract_from_data_dict(this_video_json, dict_of_keylists)

        try:
            biblio_dict["year"] = biblio_dict["published_date"][0:4]
        except KeyError:
            pass

        biblio_dict["url"] = id
        biblio_dict["repository"] = "YouTube"

        return biblio_dict
Example #4
    def biblio(self, 
            aliases,
            provider_url_template=None,
            cache_enabled=True):

        tweet_url = self.get_best_id(aliases)
        biblio_embed_url = self.biblio_template_url % (self.tweet_id(tweet_url))
        response = self.http_get(biblio_embed_url)
        data = provider._load_json(response.text)

        biblio_dict = {}
        biblio_dict["repository"] = "Twitter"
        biblio_dict["url"] = tweet_url

        if not data:
            return biblio_dict

        biblio_dict["title"] = u"@{screen_name}".format(screen_name=self.screen_name(tweet_url))
        biblio_dict["authors"] = data["author_name"]
        biblio_dict["embed"] = data["html"]
        biblio_dict["embed_url"] = biblio_embed_url
        biblio_dict["account"] = u"@{screen_name}".format(screen_name=self.screen_name(tweet_url))
        try:
            tweet_match = re.findall(u'<p>(.*?)</p>.*statuses/\d+">(.*?)</a></blockquote>', biblio_dict["embed"])
            biblio_dict["tweet_text"] = tweet_match[0][0]
            biblio_dict["date"] = datetime.datetime.strptime(tweet_match[0][1], "%B %d, %Y").isoformat()
            biblio_dict["year"] = biblio_dict["date"][0:4]
        except (AttributeError, IndexError):  # IndexError: the regex found no match
            logger.debug("couldn't parse tweet embed {embed}".format(
                embed=biblio_dict["embed"]))

        return biblio_dict
Example #5
    def _get_uuid_from_title(self, aliases_dict, page):
        data = provider._load_json(page)
        doi = aliases_dict["doi"][0]
        biblio = aliases_dict["biblio"][0]
        for mendeley_record in data["documents"]:
            if mendeley_record["doi"] == doi:
                uuid = mendeley_record["uuid"]
                return uuid
            else:
                # more complicated.  Try to match title and year.
                try:
                    mendeley_title = self.remove_punctuation(mendeley_record["title"]).lower()
                    aliases_title = self.remove_punctuation(biblio["title"]).lower()
                except (TypeError, KeyError, AttributeError):
                    continue  # nothing to see here.  Skip to next record

                if mendeley_title == aliases_title:
                    if mendeley_record["year"] == biblio["year"]:
                        # check if author name in common. if not, yell, but continue anyway
                        first_mendeley_surname = mendeley_record["authors"][0]["surname"]
                        has_matching_authors = first_mendeley_surname.lower() in biblio["authors"].lower()
                        if not has_matching_authors:
                            logger.warning("Mendeley: NO MATCHING AUTHORS between %s and %s" %(
                                first_mendeley_surname, biblio["authors"]))
                        # but return it anyway
                        uuid = mendeley_record["uuid"]
                        return uuid
                    else:
                        logger.debug("Mendeley: years don't match %s and %s" %(
                            str(mendeley_record["year"]), str(biblio["year"])))
                else:
                    logger.debug("Mendeley: titles don't match %s and %s" %(
                        self.remove_punctuation(mendeley_record["title"]), self.remove_punctuation(biblio["title"])))
        # no joy
        return None
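Note: remove_punctuation is defined elsewhere on the provider class. For experimenting with the title-matching rule above, a plausible stand-in (an assumption, not the provider's actual helper) is:

    import re

    def remove_punctuation(text):
        # Assumed stand-in: keep word characters and whitespace only, so
        # "Title!" and "title" compare equal after .lower().
        return re.sub(r"[^\w\s]", "", text)

    assert remove_punctuation("A Title, Here!").lower() == "a title here"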
Example #6
    def _extract_provenance_url(self, page, status_code=200, id=None):
        data = provider._load_json(page)
        try:
            provenance_url = data['mendeley_url']
        except KeyError:
            provenance_url = ""
        return provenance_url
Example #7
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        if "snippet" not in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_video_json = json_response["items"][0]

        dict_of_keylists = {
            'youtube:views': ['statistics', 'viewCount'],
            'youtube:likes': ['statistics', 'likeCount'],
            'youtube:dislikes': ['statistics', 'dislikeCount'],
            'youtube:favorites': ['statistics', 'favoriteCount'],
            'youtube:comments': ['statistics', 'commentCount'],
        }

        metrics_dict = provider._extract_from_data_dict(this_video_json, dict_of_keylists)

        metrics_dict = provider._metrics_dict_as_ints(metrics_dict)

        return metrics_dict
Example #8
    def top_tweeted_urls(self,
                         query,
                         query_type="site",
                         number_to_return=10,
                         pages=5):

        if query_type == "site":
            query = re.sub("http(s?)://", "", query.lower())
        elif query_type in ["twitter", "tweets_about"]:
            query = query.replace("@", "")

        template_url = self.top_tweeted_url_templates[query_type]
        urls = [template_url % (query, i) for i in range(1, pages + 1)]
        responses = self.http_get_multiple(urls)
        tweeted_entries = []
        for url in responses:
            tweeted_entries += provider._load_json(
                responses[url].text)["response"]["list"]
        sorted_list = sorted(tweeted_entries,
                             key=itemgetter('hits'),
                             reverse=True)

        top_tweeted_urls = []  # needs to be ordered

        for entry in sorted_list:
            url = self.get_url_from_entry(query, entry, query_type)
            if url and (url not in top_tweeted_urls):
                top_tweeted_urls.append(url)
        return top_tweeted_urls[0:number_to_return]
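Note: the final loop de-duplicates URLs while preserving the hits-sorted order, which a plain set() would not. The same idea in isolation, on made-up entries:

    entries = [{"url": "a.example", "hits": 9}, {"url": "b.example", "hits": 7},
               {"url": "a.example", "hits": 3}]  # made-up data
    top = []
    for entry in sorted(entries, key=lambda e: e["hits"], reverse=True):
        if entry["url"] not in top:
            top.append(entry["url"])
    assert top == ["a.example", "b.example"]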
Example #9
    def _extract_members(self, page, query_string=None):
        if 'orcid-profile' not in page:
            raise ProviderContentMalformedError("Content does not contain expected text")

        data = provider._load_json(page)
        dois = []
        try:
            orcid_works = data["orcid-profile"]["orcid-activities"]["orcid-works"]["orcid-work"]
        except KeyError:
            return []

        for work in orcid_works:
            try:
                ids = work["work-external-identifiers"]["work-external-identifier"]
                for myid in ids:
                    if myid['work-external-identifier-type'] == "DOI":
                        doi = myid['work-external-identifier-id']['value']
                        dois += [doi]
            except KeyError:
                logger.info("no external identifiers for %s, so skipping" % str(work))

        if not dois:
            raise ProviderItemNotFoundError

        members = [("doi", doi) for doi in list(set(dois))]
        return members
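Note: for reference, a much-abbreviated sketch of the nested orcid-profile shape this code walks, with assumed example values:

    # Assumed, heavily trimmed payload for illustration only.
    data = {"orcid-profile": {"orcid-activities": {"orcid-works": {"orcid-work": [
        {"work-external-identifiers": {"work-external-identifier": [
            {"work-external-identifier-type": "DOI",
             "work-external-identifier-id": {"value": "10.1234/example"}}]}}]}}}}
    work = data["orcid-profile"]["orcid-activities"]["orcid-works"]["orcid-work"][0]
    myid = work["work-external-identifiers"]["work-external-identifier"][0]
    assert myid["work-external-identifier-id"]["value"] == "10.1234/example"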
Example #10
    def _extract_biblio(self, page, id=None):

        if not "snippet" in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_video_json = json_response["items"][0]

        dict_of_keylists = {
            'title': ['snippet', 'title'],
            'channel_title': ['snippet', 'channelTitle'],
            'published_date': ['snippet', 'publishedAt']
        }

        biblio_dict = provider._extract_from_data_dict(this_video_json, dict_of_keylists)

        try:
            biblio_dict["year"] = biblio_dict["published_date"][0:4]
        except KeyError:
            pass

        biblio_dict["url"] = id
        biblio_dict["repository"] = "YouTube"

        return biblio_dict
Example #11
    def _extract_provenance_url(self, page, status_code=200, id=None):
        data = provider._load_json(page)
        try:
            provenance_url = data['mendeley_url']
        except KeyError:
            provenance_url = ""
        return provenance_url
Example #12
    def _extract_members(self, page, query_string=None):
        if 'orcid-profile' not in page:
            raise ProviderContentMalformedError("Content does not contain expected text")

        data = provider._load_json(page)
        members = []
        try:
            orcid_works = data["orcid-profile"]["orcid-activities"]["orcid-works"]["orcid-work"]
        except KeyError:
            return []

        for work in orcid_works:
            new_member = None
            try:
                ids = work["work-external-identifiers"]["work-external-identifier"]

                for myid in ids:
                    if myid['work-external-identifier-type'] == "DOI":
                        new_member = ("doi", myid['work-external-identifier-id']['value'])
                    if myid['work-external-identifier-type'] == "PMID":
                        new_member = ("pmid", myid['work-external-identifier-id']['value'])

            except KeyError:
                logger.info(u"no external identifiers, try saving whole citation")
                biblio = self._parse_orcid_work(work)
                new_member = ("biblio", biblio)

            if new_member:
                members += [new_member]    

        if not members:
            raise ProviderItemNotFoundError

        return members
Example #13
    def _extract_members(self, page, query_string=None):
        data = provider._load_json(page)
        dois = [
            item["DOI"].replace("http://dx.doi.org/", "")
            for item in data["items"]
        ]
        doi_aliases = [("doi", doi) for doi in dois]
        return doi_aliases
Example #14
    def _extract_item(self, page, id):
        data = provider._load_json(page)
        if not data:
            return {}
        item = data["items"][0]
        if item["doi"] == self._get_templated_url(self.provenance_url_template, id, "provenance"):
            return item
        else:
            return {}
Example #15
    def _extract_relevant_record(self, fullpage, id):
        data = provider._load_json(fullpage)
        response = None
        try:
            response = data["search-results"]["entry"][0]
        except (KeyError, ValueError):
            # not in Scopus database
            return None
        return response
Example #16
    def _extract_figshare_record(self, page, id):
        data = provider._load_json(page)
        if not data:
            return {}
        item = data["items"][0]
        if str(item["article_id"]) in id:
            return item
        else:
            return {}
Example #17
    def _extract_relevant_record(self, fullpage, id):
        data = provider._load_json(fullpage)
        response = None
        try:
            response = data["search-results"]["entry"][0]
        except (KeyError, ValueError):
            # not in Scopus database
            return None
        return response
Example #18
    def _extract_figshare_record(self, page, id):
        data = provider._load_json(page)
        if not data:
            return {}
        item = data["items"][0]
        if str(item["article_id"]) in id:
            return item
        else:
            return {}
Example #19
    def _extract_members(self, page, account_name): 
        members = []
        # add repositories from account
        data = provider._load_json(page)
        review_urls = [review["_id"]["url"] for review in data["reviews"]
                       if review["title"] != "An undisclosed article"]
        members += [("url", url) for url in review_urls]

        return members
Example #20
    def _get_json(self, fullpage):
        try:
            # extract json from inside the first and last parens
            # from http://codereview.stackexchange.com/questions/2561/converting-jsonp-to-json-is-this-regex-correct
            page = fullpage[fullpage.index("(") + 1:fullpage.rindex(")")]
        except (AttributeError, ValueError):
            raise ProviderContentMalformedError()

        data = provider._load_json(page)
        return data
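Note: the slice keeps everything between the first "(" and the last ")", which strips a JSONP callback wrapper. A tiny standalone demonstration with an assumed callback name (json.loads stands in for provider._load_json):

    import json

    fullpage = 'myCallback({"response": {"all": 3}})'  # hypothetical JSONP body
    page = fullpage[fullpage.index("(") + 1:fullpage.rindex(")")]
    assert json.loads(page) == {"response": {"all": 3}}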
Example #21
    def _get_json(self, fullpage):
        try:
            # extract json from inside the first and last parens
            # from http://codereview.stackexchange.com/questions/2561/converting-jsonp-to-json-is-this-regex-correct
            page = fullpage[fullpage.index("(") + 1:fullpage.rindex(")")]
        except (AttributeError, ValueError):
            raise ProviderContentMalformedError()

        data = provider._load_json(page)
        return data
Example #22
    def _extract_members(self, page, account_name): 
        members = []
        # add repositories from account
        data = provider._load_json(page)
        repos = [repo["name"] for repo in data]
        members += [("url", self.repo_url_template %(account_name, repo)) for repo in list(set(repos))]

        # also add account product!
        members += [("url", self.account_url_template %(account_name))]

        return(members)
Example #23
    def _extract_members(self, page, account_name):
        members = []
        # add repositories from account
        data = provider._load_json(page)
        review_urls = [
            review["_id"]["url"] for review in data["reviews"]
            if review["title"] != "An undisclosed article"
        ]
        members += [("url", url) for url in review_urls]

        return members
Example #24
    def _extract_members(self, page, account_name):
        members = []
        # add repositories from account
        data = provider._load_json(page)
        repos = [repo["name"] for repo in data]
        members += [("url", self.repo_url_template % (account_name, repo))
                    for repo in list(set(repos))]

        # also add account product!
        members += [("url", self.account_url_template % (account_name))]

        return members
Example #25
    def _extract_metrics(self, page, status_code=200, id=None):
        metrics_dict = {}
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        data = provider._load_json(page)
        number_of_bookmarks = len(data)
        if number_of_bookmarks:
            metrics_dict = {'delicious:bookmarks': number_of_bookmarks}

        return metrics_dict
Example #26
    def member_items(self, 
            account_name,
            provider_url_template=None, 
            cache_enabled=True):

        if not self.provides_members:
            raise NotImplementedError()

        self.logger.debug(u"%s getting member_items for %s" % (self.provider_name, account_name))

        if not provider_url_template:
            provider_url_template = self.member_items_url_template

        members = []
        figshare_userid = self.get_figshare_userid_from_author_url(account_name)
        if not figshare_userid:
            raise ProviderContentMalformedError("no figshare user id found")

        next_page = 1
        while next_page:

            url = provider_url_template % (figshare_userid, next_page)
            
            # try to get a response from the data provider  
            response = self.http_get(url, cache_enabled=cache_enabled)

            if response.status_code != 200:
                self.logger.info(u"%s status_code=%i" 
                    % (self.provider_name, response.status_code))            
                if response.status_code == 404:
                    raise ProviderItemNotFoundError
                elif response.status_code == 303:  # redirect
                    pass                
                else:
                    self._get_error(response.status_code, response)

            # extract the member ids
            number_of_items_per_page = 10  # figshare default
            try:
                page = response.text
                data = provider._load_json(page)
                if data["items_found"] > next_page*number_of_items_per_page:
                    next_page += 1
                else:
                    next_page = None
                members += self._extract_members(page, account_name)
            except (AttributeError, TypeError):
                next_page = None

        return members
Example #27
    def member_items(self,
                     account_name,
                     provider_url_template=None,
                     cache_enabled=True):

        if not self.provides_members:
            raise NotImplementedError()

        self.logger.debug(u"%s getting member_items for %s" %
                          (self.provider_name, account_name))

        if not provider_url_template:
            provider_url_template = self.member_items_url_template

        figshare_userid = self.get_figshare_userid_from_author_url(
            account_name)
        next_page = 1
        members = []
        while next_page:

            url = provider_url_template % (figshare_userid, next_page)

            # try to get a response from the data provider
            response = self.http_get(url, cache_enabled=cache_enabled)

            if response.status_code != 200:
                self.logger.info(u"%s status_code=%i" %
                                 (self.provider_name, response.status_code))
                if response.status_code == 404:
                    raise ProviderItemNotFoundError
                elif response.status_code == 303:  # redirect
                    pass
                else:
                    self._get_error(response.status_code, response)

            # extract the member ids
            number_of_items_per_page = 10  # figshare default
            try:
                page = response.text
                data = provider._load_json(page)
                if data["items_found"] > next_page * number_of_items_per_page:
                    next_page += 1
                else:
                    next_page = None
                members += self._extract_members(page, account_name)
            except (AttributeError, TypeError):
                next_page = None

        return members
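Note: the paging loop stops once items_found no longer exceeds the number of items covered by the pages fetched so far. The termination rule in isolation, assuming the figshare default of 10 items per page and a made-up items_found:

    number_of_items_per_page = 10  # figshare default, as above
    items_found = 25               # assumed value from a hypothetical response
    next_page = 1
    pages_fetched = []
    while next_page:
        pages_fetched.append(next_page)
        if items_found > next_page * number_of_items_per_page:
            next_page += 1
        else:
            next_page = None
    assert pages_fetched == [1, 2, 3]  # 25 items -> 3 pages of 10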
Example #28
    def _extract_members(self, page, query_string=None):
        if 'orcid-profile' not in page:
            raise ProviderContentMalformedError(
                "Content does not contain expected text")

        data = provider._load_json(page)
        members = []
        try:
            orcid_works = data["orcid-profile"]["orcid-activities"][
                "orcid-works"]["orcid-work"]
        except KeyError:
            return []

        for work in orcid_works:
            new_member = None
            try:
                ids = work["work-external-identifiers"][
                    "work-external-identifier"]

                for myid in ids:
                    if myid['work-external-identifier-type'] == "DOI":
                        doi = myid['work-external-identifier-id']['value']
                        doi = crossref.clean_doi(doi)
                        if doi:
                            new_member = ("doi", doi)
                    if myid['work-external-identifier-type'] == "PMID":
                        new_member = (
                            "pmid",
                            myid['work-external-identifier-id']['value'])
            except KeyError:
                pass

            if not new_member:
                logger.info(
                    u"no external identifiers, try saving whole citation for {orcid}"
                    .format(orcid=query_string))
                biblio = self._parse_orcid_work(work)
                new_member = ("biblio", biblio)

            if new_member:
                members += [new_member]

        if not members:
            raise ProviderItemNotFoundError

        logger.info(u"returning {n} members for {orcid}".format(
            n=len(members), orcid=query_string))

        return members
Example #29
    def _extract_metrics(self, page, status_code=200, id=None):
        metrics_dict = {}
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        data = provider._load_json(page)
        number_of_bookmarks = len(data)
        if number_of_bookmarks:
            metrics_dict = {
                'delicious:bookmarks': number_of_bookmarks
            }

        return metrics_dict
Example #30
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)
        data = provider._load_json(page)

        metrics_dict = {}
        for section in data["article"]["source"]:
            source = provider._lookup_json(section, ["source"])
            if (source == "Counter"):
                #drilldown_url = provider._lookup_json(section["citations"][0], ["citation", "uri"])
                html_sum = self._aggregate_monthly_stats("html_views", section)
                metrics_dict["html_views"] = html_sum
                pdf_sum = self._aggregate_monthly_stats("pdf_views", section)
                metrics_dict["pdf_views"] = pdf_sum
            elif (source == "PubMed Central Usage Stats"):
                #drilldown_url = provider._lookup_json(section["citations"][0], ["citation", "uri"])
                try:
                    first_month_stats = section["events"][0]
                except KeyError:
                    logger.debug("%20s no first_month_stats for %s" % (self.provider_name, id))
                    first_month_stats = []
                for metric_name in first_month_stats:
                    normalized_metric_name = "pmc_" + self._normalize_source(metric_name)
                    if normalized_metric_name in self.static_meta_dict:
                        total = self._aggregate_monthly_stats(metric_name, section)
                        if total:
                            metrics_dict[normalized_metric_name] = total
            elif self._normalize_source(source) in self.static_meta_dict:
                total = provider._lookup_json(section, ["count"])
                if total:
                    #drilldown_url = provider._lookup_json(section, ["public_url"])
                    #if not drilldown_url:
                    #    drilldown_url = ""
                    metrics_dict[source] = total

        rekeyed_dict = dict(("plosalm:"+self._normalize_source(k),v) for (k,v) in metrics_dict.iteritems())

        return rekeyed_dict
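Note: _aggregate_monthly_stats is defined elsewhere on the provider; from its use above it plausibly totals one named counter across a source section's monthly events. A sketch of that assumption only, not the provider's actual helper:

    def _aggregate_monthly_stats(metric_name, section):
        # Assumed behavior only: sum a counter over monthly event records.
        return sum(event.get(metric_name, 0) for event in section.get("events", []))

    section = {"events": [{"html_views": 10}, {"html_views": 5}]}  # made-up data
    assert _aggregate_monthly_stats("html_views", section) == 15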
Example #31
    def biblio(self, 
            aliases,
            provider_url_template=None,
            cache_enabled=True):

        nid = self.get_best_id(aliases)
        url = self.biblio_template_url % (self.tweet_id(nid))
        response = self.http_get(url)
        data = provider._load_json(response.text)

        biblio_dict = {}
        biblio_dict["repository"] = "Twitter"

        if not data:
            return biblio_dict

        biblio_dict["title"] = u"@{screen_name}".format(screen_name=self.screen_name(nid))
        biblio_dict["authors"] = data["author_name"]
        biblio_dict["embed"] = data["html"]

        return biblio_dict
Example #32
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        if not "sources" in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_article = json_response[0]["sources"][0]["metrics"]

        dict_of_keylists = {
            'plosalm:html_views': ['html'],
            'plosalm:pdf_views': ['pdf']
        }

        metrics_dict = provider._extract_from_data_dict(this_article, dict_of_keylists)

        return metrics_dict
Example #33
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        if not "sources" in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_article = json_response[0]["sources"][0]["metrics"]

        dict_of_keylists = {
            'plosalm:html_views': ['html'],
            'plosalm:pdf_views': ['pdf']
        }

        metrics_dict = provider._extract_from_data_dict(this_article, dict_of_keylists)

        return metrics_dict
Example #34
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        if not "user_id" in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_video_json = json_response[0]

        dict_of_keylists = {
            "vimeo:plays": ["stats_number_of_plays"],
            "vimeo:likes": ["stats_number_of_likes"],
            "vimeo:comments": ["stats_number_of_comments"],
        }

        metrics_dict = provider._extract_from_data_dict(this_video_json, dict_of_keylists)

        return metrics_dict
Example #35
    def _extract_biblio(self, page, id=None):

        json_response = provider._load_json(page)
        this_video_json = json_response[0]

        dict_of_keylists = {
            'title':        ['title'],
            'authors':      ['user_name'],
            'published_date': ['upload_date'],
            'url':          ['url']
        }

        biblio_dict = provider._extract_from_data_dict(this_video_json, dict_of_keylists)

        try:
            biblio_dict["year"] = biblio_dict["published_date"][0:4]
        except KeyError:
            pass

        biblio_dict["repository"] = "Vimeo"

        return biblio_dict
Example #36
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        if "user_id" not in page:
            raise ProviderContentMalformedError

        json_response = provider._load_json(page)
        this_video_json = json_response[0]

        dict_of_keylists = {
            'vimeo:plays': ['stats_number_of_plays'],
            'vimeo:likes': ['stats_number_of_likes'],
            'vimeo:comments': ['stats_number_of_comments']
        }

        metrics_dict = provider._extract_from_data_dict(this_video_json, dict_of_keylists)

        return metrics_dict
Example #37
    def _extract_biblio(self, page, id=None):

        json_response = provider._load_json(page)
        this_video_json = json_response[0]

        dict_of_keylists = {
            "title": ["title"],
            "authors": ["user_name"],
            "published_date": ["upload_date"],
            "url": ["url"],
        }

        biblio_dict = provider._extract_from_data_dict(this_video_json, dict_of_keylists)

        try:
            biblio_dict["year"] = biblio_dict["published_date"][0:4]
        except KeyError:
            pass

        biblio_dict["repository"] = "Vimeo"

        return biblio_dict
Example #38
    def top_tweeted_urls(self, query, query_type="site", number_to_return=10, pages=5):

        if query_type == "site":
            query = re.sub("http(s?)://", "", query.lower())
        elif query_type in ["twitter", "tweets_about"]:
            query = query.replace("@", "")

        template_url = self.top_tweeted_url_templates[query_type]
        urls = [template_url % (query, i) for i in range(1, pages+1)]
        responses = self.http_get_multiple(urls)
        tweeted_entries = [] 
        for url in responses:
            tweeted_entries += provider._load_json(responses[url].text)["response"]["list"]
        sorted_list = sorted(tweeted_entries, key=itemgetter('hits'), reverse=True) 

        top_tweeted_urls = []  # needs to be ordered

        for entry in sorted_list:
            url = self.get_url_from_entry(query, entry, query_type)
            if url and (url not in top_tweeted_urls):
                top_tweeted_urls.append(url)
        return top_tweeted_urls[0:number_to_return]
Example #39
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        metrics_dict = {}

        if "hits" in page:
            data = provider._load_json(page)
            hits = [post["hits"] for post in data["response"]["list"]]
            if hits:
                sum_of_hits = sum(hits)
                metrics_dict["topsy:tweets"] = sum_of_hits
        else:
            dict_of_keylists = {
                'topsy:tweets': ['response', 'all'],
                'topsy:influential_tweets': ['response', 'influential']
            }
            metrics_dict = provider._extract_from_json(page, dict_of_keylists)

        return metrics_dict
Example #40
    def _extract_metrics(self, page, status_code=200, id=None):
        if status_code != 200:
            if status_code == 404:
                return {}
            else:
                raise self._get_error(status_code)

        metrics_dict = {}

        if "hits" in page:
            data = provider._load_json(page)
            hits = [post["hits"] for post in data["response"]["list"]]
            if hits:
                sum_of_hits = sum(hits)
                metrics_dict["topsy:tweets"] = sum_of_hits
        else:
            dict_of_keylists = {
                'topsy:tweets': ['response', 'all'],
                'topsy:influential_tweets': ['response', 'influential']
            }
            metrics_dict = provider._extract_from_json(page, dict_of_keylists)

        return metrics_dict
Example #41
    def _extract_members(self, page, query_string=None):
        data = provider._load_json(page)
        dois = [item["DOI"].replace("http://dx.doi.org/", "") for item in data["items"]]
        doi_aliases = [("doi", doi) for doi in dois]
        return doi_aliases
Example #42
    def _get_uuid_from_title(self, aliases_dict, page):
        data = provider._load_json(page)
        try:
            doi = aliases_dict["doi"][0]
        except KeyError:
            doi = None

        try:
            biblio = aliases_dict["biblio"][0]
        except KeyError:
            biblio = None

        for mendeley_record in data["documents"]:
            if doi and (mendeley_record["doi"] == doi):
                uuid = mendeley_record["uuid"]
                return {"uuid": uuid}
            else:
                # more complicated.  Try to match title and year.
                try:
                    mendeley_title = self.remove_punctuation(mendeley_record["title"]).lower()
                    aliases_title = self.remove_punctuation(biblio["title"]).lower()
                except (TypeError, KeyError, AttributeError):
                    logger.warning(u"Mendeley: NO TITLES for aliases, skipping")
                    continue  # nothing to see here.  Skip to next record

                try:
                    if len(str(biblio["year"])) != 4:
                        logger.warning(u"Mendeley: NO YEAR for aliases, skipping")
                        continue
                except (TypeError, KeyError, AttributeError):
                    logger.warning(u"Mendeley: NO YEAR for aliases, skipping")
                    continue  # nothing to see here.  Skip to next record

                if mendeley_title == aliases_title:
                    if str(mendeley_record["year"]) == str(biblio["year"]):

                        # check if author name in common. if not, yell, but continue anyway
                        first_mendeley_surname = mendeley_record["authors"][0]["surname"]
                        has_matching_authors = first_mendeley_surname.lower() in biblio["authors"].lower()
                        if not has_matching_authors:
                            logger.warning(
                                u"Mendeley: NO MATCHING AUTHORS between %s and %s"
                                % (first_mendeley_surname, biblio["authors"])
                            )
                        # but return it anyway
                        response = {}
                        for id_type in ["uuid", "mendeley_url", "doi", "pmid"]:
                            try:
                                if mendeley_record[id_type]:
                                    if id_type == "mendeley_url":
                                        response["url"] = mendeley_record[id_type]
                                    else:
                                        response[id_type] = mendeley_record[id_type]
                            except KeyError:
                                pass
                        return response
                    else:
                        logger.debug(
                            u"Mendeley: years don't match %s and %s"
                            % (str(mendeley_record["year"]), str(biblio["year"]))
                        )
                else:
                    logger.debug(
                        u"Mendeley: titles don't match /biblio_print %s and %s"
                        % (self.remove_punctuation(mendeley_record["title"]), self.remove_punctuation(biblio["title"]))
                    )
        # no joy
        return None
Example #43
    def _get_uuid_from_title(self, aliases_dict, page):
        data = provider._load_json(page)
        try:
            doi = aliases_dict["doi"][0]
        except KeyError:
            doi = None

        try:
            biblio = aliases_dict["biblio"][0]
        except KeyError:
            biblio = None

        for mendeley_record in data["documents"]:
            if doi and (mendeley_record["doi"] == doi):
                uuid = mendeley_record["uuid"]
                return {"uuid": uuid}
            else:
                # more complicated.  Try to match title and year.
                try:
                    mendeley_title = self.remove_punctuation(
                        mendeley_record["title"]).lower()
                    aliases_title = self.remove_punctuation(
                        biblio["title"]).lower()
                except (TypeError, KeyError, AttributeError):
                    logger.warning(
                        u"Mendeley: NO TITLES for aliases, skipping")
                    continue  # nothing to see here.  Skip to next record

                try:
                    if (len(str(biblio["year"])) != 4):
                        logger.warning(
                            u"Mendeley: NO YEAR for aliases, skipping")
                        continue
                except (TypeError, KeyError, AttributeError):
                    logger.warning(u"Mendeley: NO YEAR for aliases, skipping")
                    continue  # nothing to see here.  Skip to next record

                if mendeley_title == aliases_title:
                    if str(mendeley_record["year"]) == str(biblio["year"]):

                        # check if author name in common. if not, yell, but continue anyway
                        first_mendeley_surname = mendeley_record["authors"][0]["surname"]
                        has_matching_authors = first_mendeley_surname.lower() in biblio["authors"].lower()
                        if not has_matching_authors:
                            logger.warning(
                                u"Mendeley: NO MATCHING AUTHORS between %s and %s"
                                % (first_mendeley_surname, biblio["authors"]))
                        # but return it anyway
                        response = {}
                        for id_type in ["uuid", "mendeley_url", "doi", "pmid"]:
                            try:
                                if mendeley_record[id_type]:
                                    if id_type == "mendeley_url":
                                        response["url"] = mendeley_record[
                                            id_type]
                                    else:
                                        response[id_type] = mendeley_record[
                                            id_type]
                            except KeyError:
                                pass
                        return response
                    else:
                        logger.debug(
                            u"Mendeley: years don't match %s and %s"
                            % (str(mendeley_record["year"]), str(biblio["year"])))
                else:
                    logger.debug(
                        u"Mendeley: titles don't match /biblio_print %s and %s"
                        % (self.remove_punctuation(mendeley_record["title"]),
                           self.remove_punctuation(biblio["title"])))
        # no joy
        return None
Example #44
    def _extract_members(self, page, query_string):
        data = provider._load_json(page)
        hits = [hit["name"] for hit in data]
        members = [("github", (query_string, hit)) for hit in list(set(hits))]
        return members
Example #45
    def _extract_members(self, page, query_string):
        data = provider._load_json(page)
        hits = [hit["name"] for hit in data]
        members = [("url", self.repo_url_template % (query_string, hit)) for hit in list(set(hits))]
        return members
Example #46
    def _extract_members(self, page, query_string):
        data = provider._load_json(page)
        hits = [hit["name"] for hit in data]
        members = [("url", self.repo_url_template % (query_string, hit))
                   for hit in list(set(hits))]
        return members