Example #1
def setups():
    """set up classifier if not yet trained"""
    if philosophyfilter.is_ready():
        return
    db.close()
    db.connection(db='test_opp')
    ham = Doc(url='http://umsu.de/papers/magnetism2.pdf')
    ham.load_from_db()
    ham.content = readfile(os.path.join(testdir, 'attitudes.txt'))
    ham.update_db()
    spam = Doc(url='http://umsu.de/papers/spam.pdf')
    spam.load_from_db()
    spam.content = """ 
       Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do
       eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut
       enim ad minim veniam, quis nostrud exercitation ullamco laboris
       nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor
       in reprehenderit in voluptate velit esse cillum dolore eu
       fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
       proident, sunt in culpa qui officia deserunt mollit anim id est
       laborum. 
    """
    spam.update_db()
    cur = db.cursor()
    query = "SELECT cat_id FROM cats WHERE label=%s LIMIT 1"
    cur.execute(query, ('philosophy',))
    cat_id = cur.fetchall()[0][0]  # fetchall() returns 1-tuples; unpack the id
    query = ("INSERT IGNORE INTO docs2cats (doc_id, cat_id, strength, is_training) "
             "VALUES (%s, %s, %s, %s)")
    cur.execute(query, (ham.doc_id, cat_id, 1, 1))
    cur.execute(query, (spam.doc_id, cat_id, 0, 1))
    philosophyfilter.update()
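The strength values presumably mark the training labels: 1 for the ham document (a positive example of the 'philosophy' category) and 0 for the spam document, with is_training=1 flagging both rows as training data. This is an inference from the column names, not stated in the snippet.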
Example #2
 def update_db(self, **kwargs):
     """
     update self.**kwargs and write present state to db, also set
     'last_checked'
     """
     for k,v in kwargs.items():
         setattr(self, k, v)
     cur = db.cursor()
     self.last_checked = datetime.now()
     fields = [f for f in self.db_fields.keys()
               if f != 'link_id' and getattr(self, f) is not None]
     values = [getattr(self, f) for f in fields]
     if self.link_id:
         query = "UPDATE links SET {},urlhash=MD5(url) WHERE link_id = %s".format(
             ",".join(k+"=%s" for k in fields))
         cur.execute(query, values + [self.link_id])
     else:
         query = "INSERT INTO links ({},urlhash) VALUES ({},MD5(url))".format(
             ",".join(fields), ",".join(("%s",)*len(fields)))
         try:
             cur.execute(query, values)
         except Exception:
             debug(1, "oops, %s: %s", query, ','.join(map(str, values)))
             raise
         self.link_id = cur.lastrowid
     debug(4, cur._last_executed)
     db.commit()
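The query text is assembled from whichever db_fields are currently set. With fields == ['url', 'status'] (a hypothetical state), the two branches would produce:

    -- UPDATE branch
    UPDATE links SET url=%s,status=%s,urlhash=MD5(url) WHERE link_id = %s

    -- INSERT branch
    INSERT INTO links (url,status,urlhash) VALUES (%s,%s,MD5(url))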
Example #3
 def select_names(self, num_names):
     """return list of num names names from db to check for new papers pages"""
     cur = db.cursor()
     query = "SELECT name FROM author_names WHERE is_name=1 ORDER BY last_searched ASC LIMIT {}".format(num_names)
     cur.execute(query)
     rows = cur.fetchall()
     return [row[0] for row in rows]
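Interpolating num_names into the query with format() is only safe as long as the caller passes an integer. Since MySQLdb substitutes parameters client-side, the LIMIT value can also be bound as a parameter; a minimal sketch, assuming num_names might come from untrusted input:

     query = "SELECT name FROM author_names WHERE is_name=1 ORDER BY last_searched ASC LIMIT %s"
     cur.execute(query, (int(num_names),))  # int() rejects non-numeric input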
Example #4
 def store_page(self, url, name):
     """write page <url> for author <name> to db"""
     cur = db.cursor()
     query = "INSERT INTO sources (status,sourcetype,url,default_author,name,found_date)"
     query += "VALUES (0,'personal',%s,%s,%s, NOW())"
     cur.execute(query, (url,name,"{}'s site".format(name)))
     db.commit()
Example #5
    def is_duplicate(self, url):
        """
        check if page is already in db under superficially different URL:
        with(out) trailing slash or with(out) 'www'.

        One should also check if the same page is available e.g. as
        /user/1076 and as /user/sjones. But that's tricky. Perhaps
        this functionality should be added to process_pages, where I
        could check if a new page contains any links to papers that
        haven't also been found elsewhere and if not mark it as
        inactive. TODO
        """
        cur = db.cursor()
        m = re.match(r'^(https?://)(www\.)?(.+?)(/)?$', url)
        if not m:
            logger.warning('malformed url %s?', url)
            return None
        urlvars = [
            m.group(1)+m.group(3), # no 'www.', no trailing slash
            m.group(1)+m.group(3)+'/', # no 'www.', trailing slash
            m.group(1)+'www.'+m.group(3), # 'www.', no trailing slash 
            m.group(1)+'www.'+m.group(3)+'/' # 'www.', trailing slash 
        ]
        cur.execute("SELECT url FROM sources WHERE url IN (%s, %s, %s, %s)", urlvars)
        rows = cur.fetchall()
        if rows:
            return rows[0][0]
        return None
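For a hypothetical input such as 'https://www.example.com/papers/', the regex splits the URL into scheme, optional 'www.', host plus path, and optional trailing slash, so urlvars would contain:

    ['https://example.com/papers',
     'https://example.com/papers/',
     'https://www.example.com/papers',
     'https://www.example.com/papers/']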
Example #6
def journal_names():
    try:
        return journal_names.res
    except AttributeError:
        cur = db.cursor()
        cur.execute("SELECT name FROM journals")
        journal_names.res = cur.fetchall()
        return journal_names.res
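journal_names() memoizes its result as an attribute on the function object itself, so the SELECT runs at most once per process. A minimal sketch of the same idiom, with a hypothetical expensive_query() standing in for the db round trip:

def expensive_query():
    return [('Mind',), ('Nous',)]  # stand-in for cur.fetchall()

def cached():
    try:
        return cached.res          # cache hit: reuse the stored result
    except AttributeError:         # first call: attribute not set yet
        cached.res = expensive_query()
        return cached.res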
Example #7
def categories():
    """returns list of (cat_id,cat_label) pairs from db"""
    try:
        return categories.cats
    except AttributeError:
        cur = db.cursor()
        query = ("SELECT cat_id, label FROM cats WHERE label != 'philosophy' AND label != 'blogspam'")
        cur.execute(query)
        categories.cats = list(cur.fetchall())
        return categories.cats
Example #8
 def assign_category(self, cat_id, strength):
     """inserts or updates a docs2cats entry in the db"""
     if not self.doc_id:
         raise Exception("cannot assign category: document has no id")
     cur = db.cursor()
     query = ("INSERT INTO docs2cats (cat_id, doc_id, strength) VALUES (%s,%s,%s)"
              " ON DUPLICATE KEY UPDATE strength=%s")
     cur.execute(query, (cat_id, self.doc_id, strength, strength))
     debug(4, cur._last_executed)
     db.commit()
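strength is bound twice because it appears once in the inserted values and once in the ON DUPLICATE KEY UPDATE clause. MySQL's VALUES() function can refer back to the value being inserted, which would avoid the repeated parameter; an equivalent sketch:

     query = ("INSERT INTO docs2cats (cat_id, doc_id, strength) VALUES (%s,%s,%s)"
              " ON DUPLICATE KEY UPDATE strength=VALUES(strength)")
     cur.execute(query, (cat_id, self.doc_id, strength))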
Example #9
 def update_db(self, **kwargs):
     """write **kwargs to db, also update 'last_checked'"""
     if self.source_id:
         cur = db.cursor()
         kwargs['last_checked'] = time.strftime('%Y-%m-%d %H:%M:%S') 
         query = "UPDATE sources SET {},urlhash=MD5(url) WHERE source_id = %s".format(
             ",".join(k+"=%s" for k in kwargs.keys()))
         cur.execute(query, tuple(kwargs.values()) + (self.source_id,))
         debug(3, cur._last_executed)
         db.commit()
Example #10
 def save_to_db(self):
     """write object to db"""
     cur = db.cursor()
     fields = [f for f in self.db_fields.keys()
               if f != 'link_id' and getattr(self, f) is not None]
     values = [getattr(self, f) for f in fields]
     query = "INSERT INTO sources ({}, urlhash) VALUES ({}, MD5(url))".format(
         ",".join(fields), ",".join(("%s",)*len(fields)))
     cur.execute(query, values)
     debug(3, cur._last_executed)
     db.commit()
     self.source_id = cur.lastrowid
Example #11
 def find_new_pages(self, name):
     """searches for papers pages matching author name, returns urls of new pages"""
     logger.info("\nsearching papers page(s) for %s", name)
     pages = set()
     search_terms = [
         # careful with google.com: don't block sites.google.com...
         '-site:academia.edu',
         '-site:wikipedia.org',
         '-site:philpapers.org',
         '-filetype:pdf',
         '~philosophy',
         '(publications OR articles OR papers OR "in progress" OR forthcoming)',
     ]
     # search full name first, then last name only:
     search_phrase = '"{}" '.format(name) + ' '.join(search_terms)
     searchresults = set(googlesearch.search(search_phrase))
     search_phrase = '"{}" '.format(name.split()[-1]) + ' '.join(search_terms)
     searchresults |= set(googlesearch.search(search_phrase))
     for url in searchresults:
         logger.debug("\n")
         url = util.normalize_url(url) 
         if self.bad_url(url):
             logger.info("bad url: %s", url)
             continue
         # check if url already known:
         cur = db.cursor()
         cur.execute("SELECT 1 FROM sources WHERE url = %s", (url,))
         rows = cur.fetchall()
         if rows:
             logger.info("%s already known", url)
             continue
         try:
             status, r = util.request_url(url)
             if status != 200:
                 raise Exception('status {}'.format(status))
         except Exception:
             logger.info("cannot retrieve %s", url)
         else:
             score = self.evaluate(r, name)
             if score < 0.7:
                 logger.info("%s doesn't look like a papers page", url)
                 continue
             dupe = self.is_duplicate(url)
             if dupe:
                 logger.info("%s is a duplicate of already known %s", url, dupe)
                 continue
             logger.info("new papers page for %s: %s", name, url)                
             pages.add(url)
     if not pages:
         logger.info("no pages found")
     self.update_author(name)
     return pages
Example #12
def run():
    """
    retrieve and process new blog posts that have been put in the db
    by opp-web:feedhandler
    """
    cur = db.cursor()
    query = "SELECT doc_id FROM docs WHERE doctype = 'blogpost' AND status = 0"
    cur.execute(query)
    debug(4, cur._last_executed)
    posts = cur.fetchall()
    if not posts:
        return debug(3, "no new blog posts")
    for (doc_id,) in posts:  # fetchall() returns 1-tuples
        post = Doc(doc_id=doc_id)
        post.load_from_db()
        process_blogpost(post)
Example #13
File: test_models.py Project: wo/opp-tools
def testdb():
    """set up test database"""
    db.close()
    db.connection(db='test_opp')
    cur = db.cursor()
    for t in ('sources', 'links', 'docs'):
        cur.execute('DELETE FROM {}'.format(t))
    db.commit()
    Source(
        url='http://umsu.de/papers/',
        sourcetype='personal',
        status=0,
        last_checked=datetime.now()).save_to_db()
    Source(
        url='http://consc.net/papers.html',
        sourcetype='personal',
        status=1).save_to_db()
Example #14
 def update_db(self, **kwargs):
     """update self.**kwargs and write present state to db"""
     for k, v in kwargs.items():
         setattr(self, k, v)
     cur = db.cursor()
     fields = [f for f in self.db_fields.keys()
               if f != 'doc_id' and getattr(self, f) is not None]
     values = [getattr(self, f) for f in fields]
     if self.doc_id:
         query = "UPDATE docs SET {},urlhash=MD5(url) WHERE doc_id = %s".format(
             ",".join(k+"=%s" for k in fields))
         cur.execute(query, values + [self.doc_id])
     else:
         query = "INSERT INTO docs ({},urlhash) VALUES ({},MD5(url))".format(
             ",".join(fields), ",".join(("%s",)*len(fields)))
         cur.execute(query, values)
         self.doc_id = cur.lastrowid
     debug(4, cur._last_executed)
     db.commit()
Example #15
 def run(self):
     cur = db.cursor()
     findings = []
     for url in self.select_journals():
         logger.debug("looking for author names on %s", url)
         for name in self.get_authornames(url):
             query = "INSERT INTO author_names (name, last_searched) VALUES (%s, '1970-01-01')"
             try:
                 cur.execute(query, (name,))
                 db.commit()
             except MySQLdb.IntegrityError:
                 logger.debug("%s already in db", name)
                 findings = [f for f in findings if f[0] != name]
             else:
                 logger.debug("+++ new author name %s", name)
                 name_id = cur.lastrowid
                 findings.append((name, name_id, url))
     if findings:
         self.sendmail(findings)
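The try/except presumably relies on a UNIQUE index on author_names.name (an assumption; the schema is not shown): inserting an existing name then raises MySQLdb.IntegrityError instead of silently creating a duplicate row, which is what lets the loop distinguish new names from known ones.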
Example #16
#!/usr/bin/env python3
import sys
import logging
import findmodules
from opp import db, debug
from opp.doctyper import classifier

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

debug.debuglevel(4)

cur = db.cursor()
query = ("SELECT label FROM cats")
cur.execute(query)
for row in cur.fetchall():
    classifier.update_classifier(row[0])
Example #17
 def update_author(self, name):
     """update last_searched field for author <name>"""
     cur = db.cursor()
     query = "UPDATE author_names SET last_searched=NOW() WHERE name=%s"
     cur.execute(query, (name,))
     db.commit()
Example #18
File: test_db.py Project: wo/opp-tools
def test_cursor():
    cur = db.cursor()
    query = "SHOW TABLES"
    cur.execute(query)
    tables = cur.fetchall()
    assert tables is not None
Example #19
def remove_from_db(doc):
    cur = db.cursor()
    query = "DELETE FROM docs WHERE doc_id = %s"
    cur.execute(query, (doc.doc_id,))
    debug(4, cur._last_executed)
    db.commit()