Example 1
 def _retrieve_links(self, publications):
     if self.lamr is None:
         return
     pubs_by_uids = {}
     for pub in publications:
         for id in Identifier.find_by_type(pub.identifiers, 'WOK'):
             pubs_by_uids[id.value] = pub
     uids = pubs_by_uids.keys()
     result_by_uids = self.lamr.retrieve_by_ids(uids)
     for uid, result in result_by_uids.iteritems():
         pub = pubs_by_uids[uid]
         if 'timesCited' in result:
             pub.times_cited = int(result['timesCited'])
         if 'sourceURL' in result:
             pub.source_urls.append(
                 URL(result['sourceURL'],
                     type='WOK',
                     description=u'Web of Science®'))
         if 'citingArticlesURL' in result:
             pub.cite_urls.append(
                 URL(result['citingArticlesURL'],
                     type='WOK',
                     description=u'Web of Science®'))
         if 'message' in result:
             pub.errors.append(u'Failed loading article URLs: ' +
                               unicode(result['message']))
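For reference, the per-UID result shape this method expects from lamr.retrieve_by_ids, inferred purely from the key checks above (the UT value and URL below are made up for illustration):

    result_by_uids = {
        u'000285641400008': {                       # WOK UT -> per-article fields
            'timesCited': '12',                     # parsed with int() above
            'sourceURL': 'http://example.org/rec',  # becomes a 'WOK' source URL
        },
    }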
Example 2
def populate(file):

    pp.pprint("parsing file %s" % file)
    url_objects = []
    with open(file, "r") as file_reader:  # ensure the file handle is closed
        for line in file_reader:
            if line.startswith("#"):
                continue  # skip comment lines
            pp.pprint(line)
            # strip() already removes the trailing newline; also drop quotes
            line = line.strip().strip("'")
            pp.pprint(line)
            url = URL(util.hash_domain(line), util.hash_url(line))
            if url.hash_domain != "" and url.hash_url != "":
                url_objects.append(url)
                db_session.add(url)
            db_session.commit()  # per-line commit; see the TODO below

    pp.pprint(url_objects)
    """
    TODO: this doesn't work with the large data set, perhaps there is a max without any errors?
    Will create a SQL script to insert manually into DB

    try:
        db_session.bulk_save_objects(url_objects)
        db_session.commit()
    except exc.IntegrityError:
        db_session.rollback()
    """
    results = URL.query.all()

    pp.pprint("Inserted %d rows" % len(results))
Example 3
 def get(self):
     template_values = {
         'tasks': Task.all(),
         'urls': URL.all(),
         'settings': settings,
         'datetime_now': datetime.now(pytz.timezone(settings.TIMEZONE))
                                 .strftime(settings.TIMEFORMAT)
     }
     template = JINJA_ENVIRONMENT.get_template('templates/index.html')
     self.response.write(template.render(template_values))
Example 4
 def search_citations(self, publications):
     for publication in publications:
         ut = list(Identifier.find_by_type(publication.identifiers, 'WOK'))
         if len(ut) == 0:
             continue
         ut = ut[0].value
         # str.lstrip strips a set of characters, not a prefix; remove the
         # literal u'WOS:' prefix explicitly instead.
         if ut.startswith(u'WOS:'):
             ut = ut[len(u'WOS:'):]
         for cite_url in URL.find_by_type(publication.cite_urls, 'WOK'):
             for pub in self._get_citations_from_url(cite_url.value, ut):
                 yield pub
Example 5
    def add_url(self, url, title, hostmask_id, channel=None):
        with self.dbcon.scoped_db_session() as session:
            url_qr = session.query(URL).filter(URL.url == url).all()  # One?
            if len(url_qr) == 0:
                url_obj = URL()
                #self.connection.privmsg("jabr", type(url_obj.hostmask_id))

                url_obj.url = url
                url_obj.title = title
                url_obj.channel = channel
                url_obj.hostmask_id = hostmask_id
                session.add(url_obj)
                msg = "URL %s added by %d" % (url, hostmask_id)
                self.connection.privmsg("jabr", msg)
            elif len(url_qr) == 1:
                msg = "URL %s already exist" % url
                self.connection.privmsg("jabr", msg)
            else:
                msg = "ERROR: %d instances of URL (%s) in DB" % (len(url_qr), url)
                self.connection.privmsg("jabr", msg)
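The "# One?" comment above has an idiomatic answer in SQLAlchemy's Query.one_or_none() (available since 1.0.9), which returns the single match or None and raises on duplicates, replacing the three len() branches. A sketch of the same lookup inside the same scoped session:

    from sqlalchemy.orm.exc import MultipleResultsFound

    try:
        existing = session.query(URL).filter(URL.url == url).one_or_none()
    except MultipleResultsFound:
        pass  # corresponds to the "ERROR: %d instances" branch above
    # existing is None -> create and add the new URL row as above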
Example 6
 def search_citations(self, publications):
     for publication in publications:
         eid = list(
             Identifier.find_by_type(publication.identifiers, 'SCOPUS'))
         if len(eid) == 0:
             continue
         eid = eid[0].value
         detail_url = list(
             URL.find_by_type(publication.source_urls, 'SCOPUS'))
         if len(detail_url) == 0:
             continue
         detail_url = detail_url[0].value
         for pub in self._get_citations_from_detail_url(detail_url, eid):
             yield pub
Example 7
 def receive(self, message):
     logging.info("Processing a message from: " + message.sender)
     plaintext_bodies = message.bodies('text/plain')
     for content_type, body in plaintext_bodies:
         text = body.decode()
         logging.info("Text: " + text)
         text = text.strip().split('#')
         #format: phone_number#url_id#periods#dates#sms_type#sms_sender#password
         if len(text) == 7 and text[6] == settings.MAIL_PASSWORD:
             logging.info('weather request...')
             text[1] = URL.get_by_id(int(text[1])).url #get URL string based on id in DB
             utils.addTask(*text[:6])
         #format: phone_number#text#password
         elif len(text) == 3 and text[2] == settings.MAIL_PASSWORD:
             logging.info('sms request...')
             gs = GSMService()
             gs.sendSMS(text[1], text[0], 'poland')
         else:
             logging.error("Password or format incorrect")
Example 8
 def post(self):
     url = URL(url=self.request.get('newURL').strip())
     url.put()
     self.redirect('/')
Example 9
"""
pp = pprint.PrettyPrinter(indent=4)


def populate():
    db_session.add(url1)
    db_session.add(url2)
    db_session.add(url3)
    db_session.add(url4)
    db_session.add(url5)

    db_session.commit()
    results = URL.query.all()
    pp.pprint(results)


if __name__ == '__main__':

    url1 = URL(util.hash_domain('https://googo.com/gmail'),
               util.hash_url('https://googo.com/gmail'))
    url2 = URL(util.hash_domain('https://docs.googo.com/spreadsheets/u/01/'),
               util.hash_url('https://docs.googo.com/spreadsheets/u/01/'))
    url3 = URL(util.hash_domain('https://docs.googo.com/document/u/0/1'),
               util.hash_url('https://docs.googo.com/document/u/0/1'))
    url4 = URL(util.hash_domain('https://www.appa.com/mac/index.html'),
               util.hash_url('https://www.appa.com/mac/index.html'))
    url5 = URL(util.hash_domain('https://www.appa.com/ipad/stuff.htm'),
               util.hash_url('https://www.appa.com/ipad/stuff.htm'))

    populate()
Example 10
    test_domain5 = "www2.stuffandthings.co.au"

    test_parse_url1 = "https://google.com/gmail/"
    test_parse_url2 = "https://go.google.com/somethingelse"
    test_parse_url3 = "http://stuffandthings.co.au/somethingelse/anotherthing"
    test_parse_url4 = "http://www.stuffandthings.co.au/somethingelse/anotherthing"
    test_parse_url5 = "http://www.stuffandthings.co.au:8080/somethingelse/anotherthing/yaya.html"

    test_parse_path1 = "httpsgoogle.com/gmail"
    test_parse_path2 = "httpsgo.google.com/somethingelse"
    test_parse_path3 = "httpstuffandthings.co.au/somethingelse/anotherthing"
    test_parse_path4 = "httpwww.stuffandthings.co.au/somethingelse/anotherthing"
    test_parse_path5 = "http8080www.stuffandthings.co.au/somethingelse/anotherthing/yaya.html"

    test_list = []
    url1 = URL(util.hash_domain(test_raw_url1), util.hash_url(test_raw_url1))
    url2 = URL(util.hash_domain(test_raw_url2), util.hash_url(test_raw_url2))
    url3 = URL(util.hash_domain(test_raw_url3), util.hash_url(test_raw_url3))
    url4 = URL(util.hash_domain(test_raw_url4), util.hash_url(test_raw_url4))
    url5 = URL(util.hash_domain(test_raw_url5), util.hash_url(test_raw_url5))

    test_list.append(url1)
    test_list.append(url2)
    test_list.append(url3)
    test_list.append(url4)
    test_list.append(url5)

    # Domain parsing tests
    test_parse_domain(test_raw_url1, test_domain1)
    test_parse_domain(test_raw_url2, test_domain2)
    test_parse_domain(test_raw_url3, test_domain3)
Example 11
    def _parse_csv(self, content, encoding='UTF-8'):
        csv = unicodecsv.DictReader(strip_bom(content).splitlines(),
                                    encoding=encoding)

        def empty_to_none(s):
            if s is None:
                return None
            s = s.strip()
            if len(s) == 0:
                return None
            return s

        def list_remove_empty(l):
            r = []
            for x in l:
                v = empty_to_none(x)
                if v:
                    r.append(v)
            return r

        def to_num(x):
            x = x.strip()
            if len(x) == 0:
                return 0
            return int(x)

        for line in csv:
            if line['Authors'] == '[No author name available]':
                authors = []
            else:
                # (mrshu): SCOPUS decided to separate both the surnames and
                # the individual given names of authors with commas. This
                # causes problems, so we preprocess the author list, which
                # looks for example like
                #
                # Brejová, B., Brown, D.G., Li, M., Vinař, T.
                #
                # We find the ends of full names and replace the comma there
                # with a semicolon, and then tell the function that parses the
                # author names that a semicolon is used as the separator.
                line['Authors'] = re.sub(r'\.,', ';', line['Authors'])
                authors = Author.parse_sn_first_list(line['Authors'],
                                                     separator=u';')
            pub = Publication(line['Title'], authors, to_num(line['Year']))
            source_title = empty_to_none(line['Source title'])
            if source_title:
                source_title, replacements = re.subn(
                    r' \(including subseries [^)]+\)', '', source_title)
                source_title = source_title.strip()
                if replacements:
                    pub.series = source_title
                else:
                    pub.published_in = source_title
            pub.volume = empty_to_none(line['Volume'])
            pub.issue = empty_to_none(line['Issue'])
            pub.pages = make_page_range(empty_to_none(line['Page start']),
                                        empty_to_none(line['Page end']))

            # (mrshu): for a reason I cannot understand, SCOPUS now returns
            # something like 'DOILink', merging these two fields into one. I
            # could not find a cleaner way to handle it; this hack at least
            # parses them apart.
            splits = line['DOILink'].split('"')
            if len(splits) > 1:
                line['Link'] = splits[1]
                line['DOI'] = splits[0]
            else:
                line['Link'] = splits[0]
                line['DOI'] = None

            pub.times_cited = empty_to_none(line['Cited by'])
            pub.article_no = empty_to_none(line['Art. No.'])
            pub.publisher = empty_to_none(line['Publisher'])
            url = empty_to_none(line['Link'])

            if url:
                pub.source_urls.append(
                    URL(url, type='SCOPUS', description='SCOPUS'))
                url_parts = urlparse(url)
                url_query = parse_qs(url_parts.query)
                if url_query.get('eid'):  # non-empty 'eid' parameter list
                    pub.identifiers.append(
                        Identifier(url_query['eid'][0], type='SCOPUS'))

            for issn in list_remove_empty(line['ISSN'].split(u';')):
                pub.identifiers.append(Identifier(issn, type='ISSN'))

            for isbn in list_remove_empty(line['ISBN'].split(u';')):
                pub.identifiers.append(Identifier(isbn, type='ISBN'))

            doi = empty_to_none(line['DOI'])
            if doi:
                pub.identifiers.append(Identifier(doi, type='DOI'))

            pub.indexes.append(Index('SCOPUS', type='SCOPUS'))

            yield pub
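The author-separator preprocessing above is easy to check in isolation; running the same re.sub on the sample list from the comment shows each comma that ends a full name becoming a semicolon (the final dot of each initial is consumed too, which the separator-based parser does not need):

    import re

    authors = u'Brejová, B., Brown, D.G., Li, M., Vinař, T.'
    print(re.sub(r'\.,', ';', authors))
    # Brejová, B; Brown, D.G; Li, M; Vinař, T.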
Example 12
    def entries_to_publications(self, entries):
        """Prerobi data zo SCOPUS json reprezentacie na internu Publication."""

        def empty_to_none(s):
            if s is None:
                return None
            s = s.strip()
            if len(s) == 0:
                return None
            return s

        def exists_to_none(d, key):
            if key in d:
                if type(d[key]) is list:
                    return [empty_to_none(x['$']) for x in d[key]]
                else:
                    return empty_to_none(d[key])
            else:
                return None

        def append_identifier(d, key, obj, type):
            ids = exists_to_none(d, key)
            if ids:
                if isinstance(ids, list):
                    for id in ids:
                        obj.identifiers.append(Identifier(id, type=type))
                else:
                    obj.identifiers.append(Identifier(ids, type=type))

        for entry in entries:
            author_count = int(entry['author-count']['$'])
            if author_count == 0:
                authors = []
            else:
                authors = self.authors_from_json(entry['author'])

            year = empty_to_none(entry['prism:coverDate'])
            if year:
                year = int(year.split('-')[0])
            pub = Publication(empty_to_none(entry['dc:title']), authors, year)
            pub.times_cited = empty_to_none(entry['citedby-count'])

            source_title = exists_to_none(entry, 'prism:publicationName')
            if source_title:
                source_title, replacements = re.subn(INCLUDING_RE,
                                                     '',
                                                     source_title)
                source_title = source_title.strip()
                if replacements:
                    pub.series = source_title
                else:
                    pub.published_in = source_title

            url = self.find_next_url(entry['link'], ref='scopus')
            pub.source_urls.append(URL(url,
                                       type='SCOPUS',
                                       description='SCOPUS'))

            citedby_url = self.find_next_url(entry['link'],
                                             ref='scopus-citedby')
            if citedby_url is not None:
                pub.cite_urls.append(URL(citedby_url,
                                         type='SCOPUS',
                                         description='SCOPUS'))

            pub.pages = exists_to_none(entry, 'prism:pageRange')
            pub.volume = exists_to_none(entry, 'prism:volume')
            pub.issue = exists_to_none(entry, 'prism:issueIdentifier')

            append_identifier(entry, 'prism:doi', pub, 'DOI')
            append_identifier(entry, 'prism:isbn', pub, 'ISBN')
            append_identifier(entry, 'prism:issn', pub, 'ISSN')
            append_identifier(entry, 'eid', pub, 'SCOPUS')

            pub.indexes.append(Index('SCOPUS', type='SCOPUS'))

            yield pub
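For reference, exists_to_none above unwraps the Scopus JSON convention of nesting values under a '$' key; a small illustration with a made-up entry:

    entry = {'prism:issn': [{'$': u'03029743'}, {'$': u'  '}],
             'prism:volume': u'42'}
    # exists_to_none(entry, 'prism:issn')   -> [u'03029743', None]
    # exists_to_none(entry, 'prism:volume') -> u'42'
    # exists_to_none(entry, 'prism:doi')    -> None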