def update_from_dblp(commit=False):
    """
    Pull data from DBLP and update database.

    .. note :: Existing entries are not updated. This is deliberate.

    :param commit: if `True` commit result to disk.
    :returns: tuple of publications newly made visible by this run.
    """
    newly_visible = []
    for group_member, predicate in dblp_pids():
        logging.info("Fetching user '{group_member}'".format(group_member=group_member))
        parsed = dblp_parse(dblp_fetch(group_member))
        for publication in parsed:
            # we may have added the authors to the DB in the meantime,
            # avoid duplicates by rechecking
            rechecked = []
            for author in publication.authors:
                rechecked.append(
                    Author.from_dblp_pid(session, pid=author.dblp_pid, name=author.name)
                )
            publication.authors = rechecked
            # Only publications never seen before (visibility unset) that the
            # per-member predicate accepts are switched on and reported.
            if publication.visibility is None and predicate(publication):
                logging.info("Added '{publication}'".format(publication=publication))
                publication.visibility = True
                newly_visible.append(publication)
            # Unsaved rows (no primary key yet) are staged on the session.
            if publication.id is None:
                session.add(publication)
    if commit:
        session.commit()
    return tuple(newly_visible)
def fetch_new_articles(api_client, last_id, max_count):
    """
    Fetch new articles in batches and persist them.

    For each batch: sources are bulk-inserted with ON CONFLICT DO NOTHING,
    authors are upserted, then the mapped articles are saved (merged).

    :param api_client: client used by `new_articles_iterator` to pull batches.
    :param last_id: id of the last article already stored; fetching resumes after it.
    :param max_count: maximum number of articles to fetch.
    """
    from db import Author, Source

    # Fix: the original bound the iterator to the name `iter`, shadowing the
    # builtin; also `as engine` actually yields a connection, so name it that.
    batches = new_articles_iterator(
        api_client=api_client, last_id=last_id, max_count=max_count)
    for batch_no, raw_batch in enumerate(batches):
        print(f'batch_no={batch_no}')
        batch = [map_article(a) for a in raw_batch]
        # One transaction per batch: sources and authors commit together.
        with get_engine().begin() as connection:
            connection.execute(
                insert(Source).on_conflict_do_nothing(),
                [art.source.__dict__ for art in batch if art.source is not None],
            )
            connection.execute(
                Author.upsert_query(),
                [art.author.__dict__ for art in batch if art.author is not None],
            )
        _save_all(batch, merge=True)
def get_or_create_author(session, first, last):
    """
    Look up an author by name, interactively reusing or creating one.

    :param session: database session used for query/add.
    :param first: author first name.
    :param last: author last name.
    :returns: the existing (if the user opts to reuse it) or newly created
        `Author`; new authors are prompted for gender/nationality/POC and
        added to the session (not committed here).
    :raises Exception: if more than one author matches the name.
    """
    # Fix: normalised the second predicate to `Author.last == last` (the
    # original's reversed `last == Author.last` worked but was inconsistent),
    # and replaced `len(...) == 0` / redundant `elif len(...) > 1` with
    # idiomatic truthiness and a plain `else`.
    authors = session.query(Author).filter(
        Author.first == first, Author.last == last).all()
    if not authors:
        author = Author(first=first, last=last)
        session.add(author)
    elif len(authors) == 1:
        if prompt_bool('Author {} {} found, reuse?'.format(first, last)):
            return authors[0]
        # User declined reuse: deliberately create a same-named duplicate.
        author = Author(first=first, last=last)
        session.add(author)
    else:
        raise Exception('Multiple authors found for name')
    print('Creating author {} {}'.format(first, last))
    author.gender = prompt_options('Gender?', ['M', 'F', 'NB', 'NA'])
    author.nationality = prompt('Nationality?')
    author.poc = prompt_bool('POC?')
    return author
from db import Database, Book, Author, Genre

# Demo script: seed a small library database, then exercise read/update/delete.
db = Database(dbtype='sqlite', dbname='library.db')

# Create an author.
first_author = Author()
first_author.id = 1
first_author.name = "Jeff Kinney"
db.create(first_author)

# Create a genre.
first_genre = Genre()
first_genre.name = "komedie"
# Fix: the genre was instantiated but never persisted, so the book below
# referenced a "komedie" genre that did not exist in the database.
db.create(first_genre)

# Create a book linked to the author (by id) and genre (by name).
first_book = Book()
first_book.id = 1
first_book.name = "Deník malého poseroutky 1"
first_book.year = 2007
first_book.author = 1
first_book.genre = "komedie"
db.create(first_book)

# List all authors.
authors = db.read_authors()
for author in authors:
    print(f'{author.name}')

# Update the book if it exists.
if db.read_by_id(1):
    book = db.read_by_id(1)
    book.name = 'Deník malého poseroutky 2'
    book.year = 2009
    db.update()

# Finally remove the book again.
db.delete(1)
def map_author(author, source):
    """
    Convert a raw author mapping into an `Author` row tied to *source*.

    :param author: mapping with 'id' and 'name' keys, or `None`.
    :param source: object whose `id` becomes the author's `source_id`.
    :returns: an `Author` instance, or `None` when no author data is given.
    """
    if author is None:
        return None
    author_id = author['id']
    author_name = author['name']
    return Author(id=author_id, name=author_name, source_id=source.id)
def create_author(author_name):
    """
    Persist a new `Author` named *author_name* and return its serialized form.
    """
    author = Author(name=author_name)
    session = db.session
    session.add(author)
    session.commit()
    return author.serialize_author()
def dblp_parse(root):
    """
    Parse DBLP XML

    :param root: `xml.etree.ElementTree` output of `dblp_fetch`
    :returns: a list of `Publication`s
    """
    publications = []
    for child in root:
        # DBLP wraps each record in an <r> element; skip anything else.
        if not child.tag == "r":
            continue
        publication = list(child)[0]
        dblp_key = publication.attrib["key"]
        # mdate = last modification date of the record on DBLP.
        mdate = datetime.date.fromisoformat(publication.attrib["mdate"])
        publication_type = None
        if publication.tag == "article":
            # DBLP tags preprint-style articles with publtype="informal".
            if "publtype" in publication.attrib and publication.attrib["publtype"] == "informal":
                publication_type = "informal"
            else:
                publication_type = "article"
        elif publication.tag in PUBLICATION_TYPES:
            publication_type = publication.tag
        else:
            raise ValueError(
                "Type of publication for '%s' not understood" % ET.tostring(publication)
            )
        # Proceedings records list editors rather than authors.
        author_tag = "editor" if publication_type in ("proceedings",) else "author"
        authors = []
        for author in publication.findall(author_tag):
            author_name = author.text
            # Foo Bar 0001 is a thing on DBLP
            author_name = re.match("([^0-9]*)([0-9]+)?", author_name).group(1).strip()
            authors.append(Author.from_dblp_pid(session, author.attrib["pid"], author_name))
        # many-to-many relations don't preserve order but author order can
        # matter so we store it manually
        author_order = Publication.author_orderf(authors)
        title = publication.findtext("title")
        if title.endswith("."):
            title = title[:-1]
        year = int(publication.findtext("year"))
        url = publication.findtext("ee")
        dblp_url = publication.findtext("url")
        pages = publication.findtext("pages", "")
        # Which element carries the venue name depends on the record type.
        if publication_type in ("article", "informal"):
            venue = publication.findtext("journal")
        elif publication_type == "inproceedings":
            venue = publication.findtext("booktitle")
        elif publication_type == "incollection":
            venue = publication.findtext("booktitle")
        elif publication.tag == "phdthesis":
            venue = publication.findtext("school")
        elif publication.tag == "book":
            venue = publication.findtext("publisher")
        elif publication.tag == "proceedings":
            venue = publication.findtext("publisher")
        else:
            raise ValueError(
                "Type of publication for '%s' not understood when parsing for venue"
                % ET.tostring(publication)
            )
        volume = publication.findtext("volume", "")
        number = publication.findtext("number", "")
        # IACR ePrint is so important to us we treat is specially
        if venue == "IACR Cryptol. ePrint Arch.":
            # Take the report number from the ePrint URL (e.g. .../2020/123).
            number = re.match("http(s)?://eprint.iacr.org/([0-9]{4})/([0-9]+)", url).group(3)
        publications.append(
            # get it from DB if it exists, otherwise create new entry
            Publication.from_dblp_key(
                session,
                key=dblp_key,
                type=publication_type,
                authors=authors,
                author_order=author_order,
                title=title,
                pages=pages,
                venue=venue,
                volume=volume,
                number=number,
                year=year,
                url=url,
                dblp_url=dblp_url,
                dblp_mdate=mdate,
                visibility=None,
            )
        )
        logging.debug("Found '{publication}'".format(publication=publications[-1]))
    return publications
def clean_song(manager, song):
    """
    Cleans the search title, rebuilds the search lyrics, adds a default
    author if the song does not have one and other clean ups. This should
    always called when a new song is added or changed.

    ``manager``
        The song's manager.

    ``song``
        The song object.
    """
    # SQLite (Python 2) can hand text columns back as ``buffer`` objects;
    # normalise the three text fields to unicode before processing them.
    if isinstance(song.title, buffer):
        song.title = unicode(song.title)
    if isinstance(song.alternate_title, buffer):
        song.alternate_title = unicode(song.alternate_title)
    if isinstance(song.lyrics, buffer):
        song.lyrics = unicode(song.lyrics)
    if song.title:
        song.title = clean_title(song.title)
    else:
        song.title = u''
    if song.alternate_title:
        song.alternate_title = clean_title(song.alternate_title)
    else:
        song.alternate_title = u''
    # search_title joins both titles with '@' so one field covers both.
    song.search_title = clean_string(song.title) + u'@' + \
        clean_string(song.alternate_title)
    # Only do this, if we the song is a 1.9.4 song (or older).
    if song.lyrics.find(u'<lyrics language="en">') != -1:
        # Remove the old "language" attribute from lyrics tag (prior to 1.9.5).
        # This is not very important, but this keeps the database clean. This
        # can be removed when everybody has cleaned his songs.
        song.lyrics = song.lyrics.replace(
            u'<lyrics language="en">', u'<lyrics>')
        verses = SongXML().get_verses(song.lyrics)
        song.search_lyrics = u' '.join(
            [clean_string(verse[1]) for verse in verses])
        # We need a new and clean SongXML instance.
        sxml = SongXML()
        # Rebuild the song's verses, to remove any wrong verse names (for
        # example translated ones), which might have been added prior to 1.9.5.
        # List for later comparison.
        compare_order = []
        for verse in verses:
            verse_type = VerseType.Tags[
                VerseType.from_loose_input(verse[0][u'type'])]
            sxml.add_verse_to_lyrics(
                verse_type,
                verse[0][u'label'],
                verse[1],
                verse[0].get(u'lang')
            )
            compare_order.append(
                (u'%s%s' % (verse_type, verse[0][u'label'])).upper())
            # A verse labelled '1' may be referenced without its number in the
            # verse order, so record the bare tag as valid too.
            if verse[0][u'label'] == u'1':
                compare_order.append(verse_type.upper())
        song.lyrics = unicode(sxml.extract_xml(), u'utf-8')
        # Rebuild the verse order, to convert translated verse tags, which
        # might have been added prior to 1.9.5.
        if song.verse_order:
            order = CONTROL_CHARS.sub(u'', song.verse_order).strip().split()
        else:
            order = []
        new_order = []
        for verse_def in order:
            verse_type = VerseType.Tags[
                VerseType.from_loose_input(verse_def[0])]
            if len(verse_def) > 1:
                new_order.append(
                    (u'%s%s' % (verse_type, verse_def[1:])).upper())
            else:
                new_order.append(verse_type.upper())
        song.verse_order = u' '.join(new_order)
        # Check if the verse order contains tags for verses which do not
        # exist. Blank the whole order rather than keep a dangling reference.
        for order in new_order:
            if order not in compare_order:
                song.verse_order = u''
                break
    else:
        verses = SongXML().get_verses(song.lyrics)
        song.search_lyrics = u' '.join(
            [clean_string(verse[1]) for verse in verses])
    # The song does not have any author, add one.
    if not song.authors:
        name = SongStrings.AuthorUnknown
        author = manager.get_object_filtered(
            Author, Author.display_name == name)
        if author is None:
            author = Author.populate(
                display_name=name, last_name=u'', first_name=u'')
        song.authors.append(author)
    if song.copyright:
        song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
def olx_ad_parser(self, url):
    """
    Load an OLX page in the driven browser and parse it.

    If the URL is an ad page (contains '/obyavlenie/'), the ad's name, price,
    image, author (with revealed phone number) and category breadcrumb are
    scraped and stored via the ORM (`Author`, `Category`, `Ad`).

    :param url: page URL to load.
    :returns: set of absolute link hrefs found on the page (for crawling).
    """
    # Fix: `True if ... else False` replaced with the boolean expression;
    # unused `as e` bindings and commented-out dead code removed;
    # set comprehension replaced by `set(...)`.
    is_ad = '/obyavlenie/' in url
    self.browser.get(url)
    sleep(5)
    if is_ad:
        # The phone number is hidden behind a button; click and wait for it
        # to render before grabbing the page source.
        phone_button = self.browser.find_element_by_xpath(
            '//div[contains(@class, "contact-button")]')
        phone_button.click()
        sleep(2)
    html_code = self.browser.page_source
    dom_tree = html.fromstring(html_code)
    dom_tree.make_links_absolute(base_url=url)
    if is_ad:
        # Best-effort extraction: fall back to 'not found' on any failure.
        try:
            ad_name = dom_tree.xpath('//h1')[0].text.strip()
        except Exception:
            ad_name = 'not found'
        try:
            ad_price = dom_tree.xpath(
                '//div[@class="price-label"]')[0].text_content().strip()
        except Exception:
            ad_price = 'not found'
        try:
            ad_image = dom_tree.xpath(
                '//div[@id="photo-gallery-opener"]/img/@src')[0]
        except Exception:
            ad_image = 'not found'
        author = dom_tree.xpath(
            '//div[@class="offer-user__details "]/h4/a')
        phone = dom_tree.xpath(
            '//div[contains(@class, "contact-button")]/strong')[0].text
        author = {
            'name': author[0].text.strip(),
            'profile_link': author[0].attrib['href'],
            'phone': phone
        }
        db_author, created = Author.get_or_create(**author)
        # Breadcrumb categories: each entry points at its predecessor via
        # `parent` (0 for the root category).
        categories = dom_tree.xpath('//td[@class="middle"]/ul/li/a')
        db_cats = []
        for n, cat in enumerate(categories):
            parent = 0 if n == 0 else db_cats[n - 1].id
            cat = {
                'name': cat.text_content().strip(),
                'link': cat.attrib['href'],
                'parent': parent
            }
            cat, created = Category.get_or_create(**cat)
            db_cats.append(cat)
        print(db_cats)
        ad = {
            'author': db_author,
            'url': url,
            'name': ad_name.strip(),
            'price': ad_price,
            'image': ad_image,
            'date': datetime.now()
        }
        db_ad = Ad.create(**ad)
        for cat in db_cats:
            db_ad.categories.add(cat)
        print(ad)
    links = dom_tree.xpath('//a/@href')
    links = set(links)
    return links