def _retrieve_links(self, publications):
    if self.lamr is None:
        return
    pubs_by_uids = {}
    for pub in publications:
        for id in Identifier.find_by_type(pub.identifiers, 'WOK'):
            pubs_by_uids[id.value] = pub
    uids = pubs_by_uids.keys()
    result_by_uids = self.lamr.retrieve_by_ids(uids)
    for uid, result in result_by_uids.iteritems():
        pub = pubs_by_uids[uid]
        if 'timesCited' in result:
            pub.times_cited = int(result['timesCited'])
        if 'sourceURL' in result:
            pub.source_urls.append(
                URL(result['sourceURL'], type='WOK',
                    description=u'Web of Science®'))
        if 'citingArticlesURL' in result:
            pub.cite_urls.append(
                URL(result['citingArticlesURL'], type='WOK',
                    description=u'Web of Science®'))
        if 'message' in result:
            pub.errors.append(u'Failed loading article URLs: ' +
                              unicode(result['message']))
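# A minimal illustration (not from the original source) of the per-UID result
# mapping that _retrieve_links consumes above. The keys 'timesCited',
# 'sourceURL', 'citingArticlesURL' and 'message' are the ones the method
# checks for; the UIDs and values below are made up.
example_result_by_uids = {
    'WOS:000300000000001': {
        'timesCited': '12',
        'sourceURL': 'http://example.org/source-record',
        'citingArticlesURL': 'http://example.org/citing-articles',
    },
    'WOS:000300000000002': {
        'message': 'No Result Found',
    },
}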
def populate(file):
    pp.pprint("parsing file %s" % file)
    file_reader = open(file, "r")
    url_objects = []
    for line in file_reader:
        if not line.startswith("#"):
            pp.pprint(line)
            line = line.strip()
            line = line.strip("'")
            line = line.strip("\n")
            pp.pprint(line)
            url = URL(util.hash_domain(line), util.hash_url(line))
            if not url.hash_domain == "" and not url.hash_url == "":
                url_objects.append(url)
                db_session.add(url)
    db_session.commit()
    pp.pprint(url_objects)
    """ TODO: this doesn't work with the large data set, perhaps there is a
    max without any errors? Will create a SQL script to insert manually into DB
    try:
        db_session.bulk_save_objects(url_objects)
        db_session.commit()
    except exc.IntegrityError:
        db_session.rollback()
    """
    results = URL.query.all()
    pp.pprint("Inserted %d rows" % len(results))
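# populate() above relies on util.hash_domain() and util.hash_url(), whose
# implementation is not part of this snippet. A minimal sketch of what such
# helpers could look like, assuming they return hex digests of the host part
# and of the full URL respectively (the real util module may differ):
import hashlib
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse      # Python 2

def hash_domain(raw_url):
    """Hash only the network location (domain) part of the URL."""
    domain = urlparse(raw_url).netloc
    return hashlib.sha256(domain.encode('utf-8')).hexdigest() if domain else ""

def hash_url(raw_url):
    """Hash the whole URL string."""
    return hashlib.sha256(raw_url.encode('utf-8')).hexdigest() if raw_url else ""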
def get(self):
    template_values = {
        'tasks': Task.all(),
        'urls': URL.all(),
        'settings': settings,
        'datetime_now': datetime.now(
            pytz.timezone(settings.TIMEZONE)).strftime(settings.TIMEFORMAT)
    }
    template = JINJA_ENVIRONMENT.get_template('templates/index.html')
    self.response.write(template.render(template_values))
def search_citations(self, publications):
    for publication in publications:
        ut = list(Identifier.find_by_type(publication.identifiers, 'WOK'))
        if len(ut) == 0:
            continue
        # Remove the 'WOS:' prefix explicitly; lstrip(u'WOS:') would strip
        # any of the characters W, O, S and ':' rather than the prefix.
        ut = ut[0].value
        if ut.startswith(u'WOS:'):
            ut = ut[len(u'WOS:'):]
        for cite_url in URL.find_by_type(publication.cite_urls, 'WOK'):
            for pub in self._get_citations_from_url(cite_url.value, ut):
                yield pub
def add_url(self, url, title, hostmask_id, channel=None):
    with self.dbcon.scoped_db_session() as session:
        url_qr = session.query(URL).filter(URL.url == url).all()
        # Does this URL already exist? At most one row is expected.
        if len(url_qr) == 0:
            url_obj = URL()
            url_obj.url = url
            url_obj.title = title
            url_obj.channel = channel
            url_obj.hostmask_id = hostmask_id
            session.add(url_obj)
            msg = "URL %s added by %d" % (url, hostmask_id)
            self.connection.privmsg("jabr", msg)
        elif len(url_qr) == 1:
            msg = "URL %s already exists" % url
            self.connection.privmsg("jabr", msg)
        else:
            msg = "ERROR: %d instances of URL (%s) in DB" % (len(url_qr), url)
            self.connection.privmsg("jabr", msg)
def search_citations(self, publications):
    for publication in publications:
        eid = list(
            Identifier.find_by_type(publication.identifiers, 'SCOPUS'))
        if len(eid) == 0:
            continue
        eid = eid[0].value
        detail_url = list(
            URL.find_by_type(publication.source_urls, 'SCOPUS'))
        if len(detail_url) == 0:
            continue
        detail_url = detail_url[0].value
        for pub in self._get_citations_from_detail_url(detail_url, eid):
            yield pub
def receive(self, message):
    logging.info("Processing a message from: " + message.sender)
    plaintext_bodies = message.bodies('text/plain')
    for content_type, body in plaintext_bodies:
        text = body.decode()
        logging.info("Text: " + text)
        text = text.strip().split('#')
        # format: phone_number#url_id#periods#dates#sms_type#sms_sender#password
        if len(text) == 7 and text[6] == settings.MAIL_PASSWORD:
            logging.info('weather request...')
            # get the URL string based on the id stored in the datastore
            text[1] = URL.get_by_id(int(text[1])).url
            utils.addTask(*text[:6])
        # format: phone_number#text#password
        elif len(text) == 3 and text[2] == settings.MAIL_PASSWORD:
            logging.info('sms request...')
            gs = GSMService()
            gs.sendSMS(text[1], text[0], 'poland')
        else:
            logging.error("Password or format incorrect")
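# A made-up example (not from the original source) of the '#'-separated mail
# body that receive() expects for a weather request; the seven fields map to
# phone_number, url_id, periods, dates, sms_type, sms_sender and password.
sample_body = "+48123456789#42#3#2015-06-01#weather#forecaster#secret"
fields = sample_body.strip().split('#')
assert len(fields) == 7
phone_number, url_id, periods, dates, sms_type, sms_sender, password = fields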
def post(self):
    url = URL(url=self.request.get('newURL').strip())
    url.put()
    self.redirect('/')
""" pp = pprint.PrettyPrinter(indent=4) def populate(): db_session.add(url1) db_session.add(url2) db_session.add(url3) db_session.add(url4) db_session.add(url5) db_session.commit() results = URL.query.all() pp.pprint(results) if __name__ == '__main__': url1 = URL(util.hash_domain('https://googo.com/gmail'), util.hash_url('https://googo.com/gmail')) url2 = URL(util.hash_domain('https://docs.googo.com/spreadsheets/u/01/'), util.hash_url('https://docs.googo.com/spreadsheets/u/01/')) url3 = URL(util.hash_domain('https://docs.googo.com/document/u/0/1'), util.hash_url('https://docs.googo.com/document/u/0/1')) url4 = URL(util.hash_domain('https://www.appa.com/mac/index.html'), util.hash_url('https://www.appa.com/mac/index.html')) url5 = URL(util.hash_domain('https://www.appa.com/ipad/stuff.htm'), util.hash_url('https://www.appa.com/ipad/stuff.htm')) populate()
test_domain5 = "www2.stuffandthings.co.au"
test_parse_url1 = "https://google.com/gmail/"
test_parse_url2 = "https://go.google.com/somethingelse"
test_parse_url3 = "http://stuffandthings.co.au/somethingelse/anotherthing"
test_parse_url4 = "http://www.stuffandthings.co.au/somethingelse/anotherthing"
test_parse_url5 = "http://www.stuffandthings.co.au:8080/somethingelse/anotherthing/yaya.html"
test_parse_path1 = "httpsgoogle.com/gmail"
test_parse_path2 = "httpsgo.google.com/somethingelse"
test_parse_path3 = "httpstuffandthings.co.au/somethingelse/anotherthing"
test_parse_path4 = "httpwww.stuffandthings.co.au/somethingelse/anotherthing"
test_parse_path5 = "http8080www.stuffandthings.co.au/somethingelse/anotherthing/yaya.html"

test_list = []
url1 = URL(util.hash_domain(test_raw_url1), util.hash_url(test_raw_url1))
url2 = URL(util.hash_domain(test_raw_url2), util.hash_url(test_raw_url2))
url3 = URL(util.hash_domain(test_raw_url3), util.hash_url(test_raw_url3))
url4 = URL(util.hash_domain(test_raw_url4), util.hash_url(test_raw_url4))
url5 = URL(util.hash_domain(test_raw_url5), util.hash_url(test_raw_url5))
test_list.append(url1)
test_list.append(url2)
test_list.append(url3)
test_list.append(url4)
test_list.append(url5)

# Domain parsing tests
test_parse_domain(test_raw_url1, test_domain1)
test_parse_domain(test_raw_url2, test_domain2)
test_parse_domain(test_raw_url3, test_domain3)
def _parse_csv(self, content, encoding='UTF-8'):
    csv = unicodecsv.DictReader(strip_bom(content).splitlines(),
                                encoding=encoding)

    def empty_to_none(s):
        if s is None:
            return None
        s = s.strip()
        if len(s) == 0:
            return None
        return s

    def list_remove_empty(l):
        r = []
        for x in l:
            v = empty_to_none(x)
            if v:
                r.append(v)
        return r

    def to_num(x):
        x = x.strip()
        if len(x) == 0:
            return 0
        return int(x)

    for line in csv:
        if line['Authors'] == '[No author name available]':
            authors = []
        else:
            # (mrshu): SCOPUS separates both surnames and the authors'
            # given-name initials with commas. That causes problems, so we
            # preprocess the author list, which looks for example like
            #
            #   Brejová, B., Brown, D.G., Li, M., Vinař, T.
            #
            # We find the ends of complete names and replace the comma there
            # with a semicolon, and then tell the author-parsing function
            # that a semicolon is used as the separator.
            line['Authors'] = re.sub(r'\.,', ';', line['Authors'])
            authors = Author.parse_sn_first_list(line['Authors'],
                                                 separator=u';')
        pub = Publication(line['Title'], authors, to_num(line['Year']))
        source_title = empty_to_none(line['Source title'])
        if source_title:
            source_title, replacements = re.subn(
                r' \(including subseries [^)]+\)', '', source_title)
            source_title = source_title.strip()
            if replacements:
                pub.series = source_title
            else:
                pub.published_in = source_title
        pub.volume = empty_to_none(line['Volume'])
        pub.issue = empty_to_none(line['Issue'])
        pub.pages = make_page_range(empty_to_none(line['Page start']),
                                    empty_to_none(line['Page end']))
        # (mrshu): for a reason I cannot understand, SCOPUS now returns
        # something like 'DOILink', which glues these two fields together.
        # I could not find a cleaner way to handle it; this hack at least
        # parses the two parts apart again.
        splits = line['DOILink'].split('"')
        if len(splits) > 1:
            line['Link'] = splits[1]
            line['DOI'] = splits[0]
        else:
            line['Link'] = splits[0]
            line['DOI'] = None
        pub.times_cited = empty_to_none(line['Cited by'])
        pub.article_no = empty_to_none(line['Art. No.'])
        pub.publisher = empty_to_none(line['Publisher'])
        url = empty_to_none(line['Link'])
        if url:
            pub.source_urls.append(
                URL(url, type='SCOPUS', description='SCOPUS'))
            url_parts = urlparse(url)
            url_query = parse_qs(url_parts.query)
            if 'eid' in url_query and len(url_query['eid']):
                pub.identifiers.append(
                    Identifier(url_query['eid'][0], type='SCOPUS'))
        for issn in list_remove_empty(line['ISSN'].split(u';')):
            pub.identifiers.append(Identifier(issn, type='ISSN'))
        for isbn in list_remove_empty(line['ISBN'].split(u';')):
            pub.identifiers.append(Identifier(isbn, type='ISBN'))
        doi = empty_to_none(line['DOI'])
        if doi:
            pub.identifiers.append(Identifier(doi, type='DOI'))
        pub.indexes.append(Index('SCOPUS', type='SCOPUS'))
        yield pub
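# A small standalone illustration (not part of the original module) of the
# author-list preprocessing used in _parse_csv above: the comma that closes a
# full "Surname, Initials." name is turned into a semicolon so the list can
# be split unambiguously afterwards.
import re

authors_field = u'Brejová, B., Brown, D.G., Li, M., Vinař, T.'
preprocessed = re.sub(r'\.,', ';', authors_field)
# preprocessed == u'Brejová, B; Brown, D.G; Li, M; Vinař, T.'
names = [name.strip() for name in preprocessed.split(u';')]
# names == [u'Brejová, B', u'Brown, D.G', u'Li, M', u'Vinař, T.']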
def entries_to_publications(self, entries):
    """Convert data from the SCOPUS JSON representation into internal Publication objects."""

    def empty_to_none(s):
        if s is None:
            return None
        s = s.strip()
        if len(s) == 0:
            return None
        return s

    def exists_to_none(d, key):
        if key in d:
            if type(d[key]) is list:
                return [empty_to_none(x['$']) for x in d[key]]
            else:
                return empty_to_none(d[key])
        else:
            return None

    def append_identifier(d, key, obj, type):
        ids = exists_to_none(d, key)
        if ids:
            if isinstance(ids, list):
                for id in ids:
                    obj.identifiers.append(Identifier(id, type=type))
            else:
                obj.identifiers.append(Identifier(ids, type=type))

    for entry in entries:
        author_count = int(entry['author-count']['$'])
        if author_count == 0:
            authors = []
        else:
            authors = self.authors_from_json(entry['author'])
        year = empty_to_none(entry['prism:coverDate'])
        if year:
            year = int(year.split('-')[0])
        pub = Publication(empty_to_none(entry['dc:title']), authors, year)
        pub.times_cited = empty_to_none(entry['citedby-count'])
        source_title = exists_to_none(entry, 'prism:publicationName')
        if source_title:
            source_title, replacements = re.subn(INCLUDING_RE, '',
                                                 source_title)
            source_title = source_title.strip()
            if replacements:
                pub.series = source_title
            else:
                pub.published_in = source_title
        url = self.find_next_url(entry['link'], ref='scopus')
        pub.source_urls.append(URL(url, type='SCOPUS', description='SCOPUS'))
        citedby_url = self.find_next_url(entry['link'], ref='scopus-citedby')
        if citedby_url is not None:
            pub.cite_urls.append(
                URL(citedby_url, type='SCOPUS', description='SCOPUS'))
        pub.pages = exists_to_none(entry, 'prism:pageRange')
        pub.volume = exists_to_none(entry, 'prism:volume')
        pub.issue = exists_to_none(entry, 'prism:issueIdentifier')
        append_identifier(entry, 'prism:doi', pub, 'DOI')
        append_identifier(entry, 'prism:isbn', pub, 'ISBN')
        append_identifier(entry, 'prism:issn', pub, 'ISSN')
        append_identifier(entry, 'eid', pub, 'SCOPUS')
        pub.indexes.append(Index('SCOPUS', type='SCOPUS'))
        yield pub
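# entries_to_publications() above relies on self.find_next_url(), which is
# defined elsewhere in the original class. A minimal sketch of what it could
# look like, assuming each SCOPUS 'link' entry is a dict carrying '@ref' and
# '@href' keys (an assumption about the JSON shape, not the original code):
def find_next_url(links, ref):
    """Return the '@href' of the first link whose '@ref' matches, else None."""
    for link in links:
        if link.get('@ref') == ref:
            return link.get('@href')
    return None

# Example with made-up data:
links = [
    {'@ref': 'scopus', '@href': 'https://www.scopus.com/record'},
    {'@ref': 'scopus-citedby', '@href': 'https://www.scopus.com/citedby'},
]
assert find_next_url(links, ref='scopus-citedby') == 'https://www.scopus.com/citedby'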