Пример #1
0
def update_from_dblp(commit=False):
    """
    Fetch every tracked DBLP profile and register its publications.

    For each ``(pid, predicate)`` pair yielded by `dblp_pids` the profile is
    fetched with `dblp_fetch` and parsed with `dblp_parse`.  The authors of
    every publication are resolved again through `Author.from_dblp_pid`.
    A publication whose ``visibility`` is still ``None`` and which satisfies
    the predicate is made visible and reported back to the caller.  Any
    publication without an ``id`` is added to the session.

    .. note :: Existing entries are not updated. This is deliberate.

    :param commit: if `True` commit result to disk.
    :returns: tuple of the publications that were made visible by this call.

    """
    added = []
    for pid, accept in dblp_pids():
        logging.info("Fetching user '{group_member}'".format(group_member=pid))
        parsed = dblp_parse(dblp_fetch(pid))

        for entry in parsed:
            # authors may have reached the DB since parsing; look them up again to avoid duplicates
            entry.authors = [
                Author.from_dblp_pid(session, pid=person.dblp_pid, name=person.name)
                for person in entry.authors
            ]

            # the predicate is only consulted for entries that have no visibility yet
            newly_visible = entry.visibility is None and accept(entry)
            if newly_visible:
                logging.info("Added '{publication}'".format(publication=entry))
                entry.visibility = True
                added.append(entry)

            if entry.id is None:
                session.add(entry)

    if commit:
        session.commit()

    return tuple(added)
Пример #2
0
def fetch_new_articles(api_client, last_id, max_count):
    """Pull new articles in batches and persist each batch.

    Every batch produced by `new_articles_iterator` is mapped with
    `map_article`.  Inside one ``get_engine().begin()`` block the sources and
    authors referenced by the batch are written first (sources with
    ``on_conflict_do_nothing``, authors with ``Author.upsert_query()``).
    After that block the mapped articles are handed to
    ``_save_all(..., merge=True)``.

    :param api_client: forwarded to `new_articles_iterator`.
    :param last_id: forwarded to `new_articles_iterator`.
    :param max_count: forwarded to `new_articles_iterator`.
    """
    from db import Author, Source

    batches = new_articles_iterator(
        api_client=api_client,
        last_id=last_id,
        max_count=max_count,
    )

    for batch_no, raw_batch in enumerate(batches):
        print(f'batch_no={batch_no}')
        articles = [map_article(raw) for raw in raw_batch]

        with get_engine().begin() as connection:
            # related rows go in before the articles that reference them
            source_rows = [
                vars(article.source)
                for article in articles
                if article.source is not None
            ]
            connection.execute(insert(Source).on_conflict_do_nothing(), source_rows)

            author_rows = [
                vars(article.author)
                for article in articles
                if article.author is not None
            ]
            connection.execute(Author.upsert_query(), author_rows)

        _save_all(articles, merge=True)
Пример #3
0
def get_or_create_author(session, first, last):
    """Return an `Author` for the given name, creating one interactively if needed.

    The session is queried for authors whose ``first`` and ``last`` both
    match.  With exactly one match the user is asked whether to reuse it.
    With no match, or when the user declines, a new `Author` is added to the
    session and its gender, nationality and POC flag are prompted for.
    Nothing is committed here.

    :param session: session used for the lookup and for adding a new author.
    :param first: first name to look up / create.
    :param last: last name to look up / create.
    :returns: the reused or newly created `Author`.
    :raises Exception: if more than one author matches the name.
    """
    matches = session.query(Author).filter(Author.first == first,
                                           last == Author.last).all()
    match_count = len(matches)

    if match_count > 1:
        raise Exception('Multiple authors found for name')

    if match_count == 1 and prompt_bool(
            'Author {} {} found, reuse?'.format(first, last)):
        return matches[0]

    # either nothing matched or the user declined the existing record
    author = Author(first=first, last=last)
    session.add(author)

    print('Creating author {} {}'.format(first, last))
    author.gender = prompt_options('Gender?', ['M', 'F', 'NB', 'NA'])
    author.nationality = prompt('Nationality?')
    author.poc = prompt_bool('POC?')

    return author
Пример #4
0
from db import Database, Book, Author, Genre

# Demo script: walks through create / read / update / delete calls on the
# `Database` wrapper imported from the project's `db` module.

# Database handle configured with dbtype 'sqlite' and the name 'library.db'.
db = Database(dbtype='sqlite', dbname='library.db')

# --- create an author -------------------------------------------------------
first_author = Author()
first_author.id = 1
first_author.name = "Jeff Kinney"
db.create(first_author)

# NOTE(review): this Genre is built but never passed to db.create() in this
# script; the book below refers to the genre by the plain string "komedie".
first_genre = Genre()
first_genre.name = "komedie"

# --- create a book ----------------------------------------------------------
first_book = Book()
first_book.id = 1
first_book.name = "Deník malého poseroutky 1"
first_book.year = 2007
# author is set to the integer 1, the same value as first_author.id above
first_book.author = 1
first_book.genre = "komedie"
db.create(first_book)

# --- read -------------------------------------------------------------------
authors = db.read_authors()
for author in authors:
    print(f'{author.name}')

# --- update -----------------------------------------------------------------
# read_by_id(1) is called twice: once as a truthiness check and once to obtain
# the object that is modified.
if db.read_by_id(1):
    book = db.read_by_id(1)
    book.name = 'Deník malého poseroutky 2'
    book.year = 2009
    # update() takes no argument; presumably it persists the pending changes
    # made to `book` -- TODO confirm against db.Database.
    db.update()

# --- delete -----------------------------------------------------------------
# NOTE(review): which kind of record id 1 refers to is decided inside
# Database.delete and is not visible from this script.
db.delete(1)
Пример #5
0
def map_author(author, source):
    """Build an `Author` from an API author mapping.

    :param author: mapping with ``'id'`` and ``'name'`` keys, or ``None``.
    :param source: object whose ``id`` becomes the author's ``source_id``;
        only read when ``author`` is not ``None``.
    :returns: the new `Author`, or ``None`` when ``author`` is ``None``.
    """
    if author is not None:
        return Author(
            id=author['id'],
            name=author['name'],
            source_id=source.id,
        )

    return None
def create_author(author_name):
    """Create an `Author` with the given name, commit it and return its serialised form.

    :param author_name: value stored in the new author's ``name``.
    :returns: whatever ``serialize_author()`` returns for the new author.
    """
    author = Author(name=author_name)

    # add and commit immediately; serialisation happens after the commit
    db.session.add(author)
    db.session.commit()

    serialized = author.serialize_author()
    return serialized
Пример #7
0
def dblp_parse(root):
    """
    Parse DBLP XML

    Only ``<r>`` children of ``root`` are considered; the first child element
    of each ``<r>`` is taken as the publication record.  Authors are resolved
    through `Author.from_dblp_pid` and publications through
    `Publication.from_dblp_key`, which returns the stored entry if there is
    one and a new entry otherwise.

    :param root: `xml.etree.ElementTree` output of `dblp_fetch`
    :returns: a list of `Publication`s
    :raises ValueError: if a record's tag is neither ``article`` nor listed in
        `PUBLICATION_TYPES`, or if no venue field is known for its type

    """
    # "Foo Bar 0001" is a thing on DBLP: group(1) is the name without the numeric suffix
    name_pattern = re.compile(r"([^0-9]*)([0-9]+)?")
    # group(3) is the report number in https://eprint.iacr.org/<year>/<number>
    eprint_pattern = re.compile(r"http(s)?://eprint\.iacr\.org/([0-9]{4})/([0-9]+)")
    # which child element holds the venue, per publication type
    venue_fields = {
        "article": "journal",
        "informal": "journal",
        "inproceedings": "booktitle",
        "incollection": "booktitle",
        "phdthesis": "school",
        "book": "publisher",
        "proceedings": "publisher",
    }

    publications = []

    for child in root:
        if child.tag != "r":
            continue

        publication = list(child)[0]

        dblp_key = publication.attrib["key"]
        mdate = datetime.date.fromisoformat(publication.attrib["mdate"])

        if publication.tag == "article":
            # articles flagged publtype="informal" get their own type
            if publication.attrib.get("publtype") == "informal":
                publication_type = "informal"
            else:
                publication_type = "article"
        elif publication.tag in PUBLICATION_TYPES:
            publication_type = publication.tag
        else:
            raise ValueError(
                "Type of publication for '%s' not understood" % ET.tostring(publication)
            )

        # proceedings list editors instead of authors
        author_tag = "editor" if publication_type == "proceedings" else "author"

        authors = []
        for author in publication.findall(author_tag):
            author_name = name_pattern.match(author.text).group(1).strip()
            authors.append(Author.from_dblp_pid(session, author.attrib["pid"], author_name))

        # many-to-many relations don't preserve order but author order can matter so we store it manually
        author_order = Publication.author_orderf(authors)

        # Titles may contain inline markup (<i>, <sub>, <sup>, ...).  findtext()
        # would stop at the first nested tag, itertext() keeps the whole title.
        title = "".join(publication.find("title").itertext())
        if title.endswith("."):
            title = title[:-1]

        year = int(publication.findtext("year"))
        url = publication.findtext("ee")
        dblp_url = publication.findtext("url")
        pages = publication.findtext("pages", "")

        venue_field = venue_fields.get(publication_type)
        if venue_field is None:
            raise ValueError(
                "Type of publication for '%s' not understood when parsing for venue"
                % ET.tostring(publication)
            )
        venue = publication.findtext(venue_field)

        volume = publication.findtext("volume", "")
        number = publication.findtext("number", "")
        # IACR ePrint is so important to us we treat it specially
        if venue == "IACR Cryptol. ePrint Arch.":
            # <ee> can be missing or point somewhere other than eprint.iacr.org;
            # in that case keep the number DBLP gave us instead of crashing
            eprint = eprint_pattern.match(url or "")
            if eprint is not None:
                number = eprint.group(3)

        # get it from DB if it exists, otherwise create new entry
        entry = Publication.from_dblp_key(
            session,
            key=dblp_key,
            type=publication_type,
            authors=authors,
            author_order=author_order,
            title=title,
            pages=pages,
            venue=venue,
            volume=volume,
            number=number,
            year=year,
            url=url,
            dblp_url=dblp_url,
            dblp_mdate=mdate,
            visibility=None,
        )
        publications.append(entry)
        logging.debug("Found '{publication}'".format(publication=entry))

    return publications
Пример #8
0
def clean_song(manager, song):
    """
    Normalises a song: converts buffer fields to unicode, cleans the titles,
    rebuilds the search title and search lyrics, upgrades lyrics that still
    carry the old ``language`` attribute, adds a default author when the song
    has none and strips control characters from the copyright. Call this
    whenever a song is added or changed.

    ``manager``
        The song's manager.

    ``song``
        The song object.
    """
    # Raw buffers are converted to unicode before any text processing.
    for field in ('title', 'alternate_title', 'lyrics'):
        value = getattr(song, field)
        if isinstance(value, buffer):
            setattr(song, field, unicode(value))
    song.title = clean_title(song.title) if song.title else u''
    song.alternate_title = clean_title(song.alternate_title) if song.alternate_title else u''
    title_key = clean_string(song.title)
    alternate_key = clean_string(song.alternate_title)
    song.search_title = title_key + u'@' + alternate_key
    # Lyrics with the old "language" attribute belong to a 1.9.4 song (or
    # older). Dropping the attribute is not very important, but it keeps the
    # database clean; this can go once everybody has cleaned their songs.
    is_legacy = song.lyrics.find(u'<lyrics language="en">') != -1
    if is_legacy:
        song.lyrics = song.lyrics.replace(u'<lyrics language="en">', u'<lyrics>')
    verses = SongXML().get_verses(song.lyrics)
    song.search_lyrics = u' '.join([clean_string(verse[1]) for verse in verses])
    if is_legacy:
        # A fresh SongXML instance is used to rebuild the verses, which removes
        # wrong verse names (for example translated ones) that might have been
        # added prior to 1.9.5.
        sxml = SongXML()
        # Every tag the rebuilt lyrics contain, for the comparison below.
        known_tags = []
        for verse in verses:
            attributes = verse[0]
            tag = VerseType.Tags[VerseType.from_loose_input(attributes[u'type'])]
            label = attributes[u'label']
            sxml.add_verse_to_lyrics(tag, label, verse[1], attributes.get(u'lang'))
            known_tags.append((u'%s%s' % (tag, label)).upper())
            if label == u'1':
                # Verse number 1 may also be referred to by its bare tag.
                known_tags.append(tag.upper())
        song.lyrics = unicode(sxml.extract_xml(), u'utf-8')
        # Rebuild the verse order as well, to convert translated verse tags
        # which might have been added prior to 1.9.5.
        tokens = CONTROL_CHARS.sub(u'', song.verse_order).strip().split() if song.verse_order else []
        rebuilt = []
        for token in tokens:
            tag = VerseType.Tags[VerseType.from_loose_input(token[0])]
            if len(token) > 1:
                rebuilt.append((u'%s%s' % (tag, token[1:])).upper())
            else:
                rebuilt.append(tag.upper())
        song.verse_order = u' '.join(rebuilt)
        # A verse order that mentions a verse which does not exist is dropped.
        if any(entry not in known_tags for entry in rebuilt):
            song.verse_order = u''

    # A song without any author gets the "unknown author" entry.
    if not song.authors:
        unknown = SongStrings.AuthorUnknown
        fallback = manager.get_object_filtered(Author, Author.display_name == unknown)
        if fallback is None:
            fallback = Author.populate(display_name=unknown, last_name=u'', first_name=u'')
        song.authors.append(fallback)
    if song.copyright:
        song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
Пример #9
0
    def olx_ad_parser(self, url):
        """Load ``url`` in the browser, store the ad it shows (if any) and return its links.

        A URL containing ``/obyavlenie/`` is treated as an ad page.  For such
        a page the contact button is clicked, then name, price and image are
        scraped on a best-effort basis (``'not found'`` on any failure).  The
        author and the category chain are fetched or created, an `Ad` row is
        created and linked to the categories.  Author and phone extraction
        are not guarded and raise if the elements are missing.

        :param url: page to load; also used as base for making links absolute.
        :returns: set of all ``href`` values found on the page.
        """
        is_ad = '/obyavlenie/' in url

        self.browser.get(url)
        sleep(5)

        if is_ad:
            # click the contact button; the phone number is read from it further down
            self.browser.find_element_by_xpath(
                '//div[contains(@class, "contact-button")]').click()
            sleep(2)

        dom_tree = html.fromstring(self.browser.page_source)
        dom_tree.make_links_absolute(base_url=url)

        if is_ad:
            def scrape(extract):
                # best effort: any failure yields the 'not found' placeholder
                try:
                    return extract()
                except Exception:
                    return 'not found'

            ad_name = scrape(lambda: dom_tree.xpath('//h1')[0].text.strip())
            ad_price = scrape(lambda: dom_tree.xpath(
                '//div[@class="price-label"]')[0].text_content().strip())
            ad_image = scrape(lambda: dom_tree.xpath(
                '//div[@id="photo-gallery-opener"]/img/@src')[0])

            author_nodes = dom_tree.xpath(
                '//div[@class="offer-user__details "]/h4/a')
            phone = dom_tree.xpath(
                '//div[contains(@class, "contact-button")]/strong')[0].text

            author_node = author_nodes[0]
            db_author, _ = Author.get_or_create(
                name=author_node.text.strip(),
                profile_link=author_node.attrib['href'],
                phone=phone,
            )

            # each category's parent is the previously stored one; the first has parent 0
            db_cats = []
            for cat_link in dom_tree.xpath('//td[@class="middle"]/ul/li/a'):
                parent = db_cats[-1].id if db_cats else 0
                db_cat, _ = Category.get_or_create(
                    name=cat_link.text_content().strip(),
                    link=cat_link.attrib['href'],
                    parent=parent,
                )
                db_cats.append(db_cat)

            print(db_cats)

            ad = {
                'author': db_author,
                'url': url,
                'name': ad_name.strip(),
                'price': ad_price,
                'image': ad_image,
                'date': datetime.now()
            }

            db_ad = Ad.create(**ad)

            for db_cat in db_cats:
                db_ad.categories.add(db_cat)

            print(ad)

        return set(dom_tree.xpath('//a/@href'))