Example #1
    def old_link(self, url):
        """
        If a link to (a session variant of) url is already known on this
        page (as stored in the database), returns the stored Link,
        otherwise returns None.
        """
        if not hasattr(self, '_links'):
            cur = db.dict_cursor()
            query = "SELECT * FROM links WHERE source_id = %s"
            cur.execute(query, (self.source_id,))
            debug(5, cur._last_executed)
            self._links = [ Link(source=self, **li) for li in cur.fetchall() ]
            #debug(2, 'xxx old links:\n%s', '\n'.join([li.url for li in self._links]))

        for li in self._links:
            if li.url == url:
                return li

        s_url = self.strip_session_variables(url)
        if s_url != url:
            for li in self._links:
                if s_url == self.strip_session_variables(li.url):
                    return li

        return None
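A hedged usage sketch: assuming old_link is a method of the Source class (as in Example #3) and that strip_session_variables removes session tokens such as a PHPSESSID query parameter, the second pass matches a freshly scraped link against a stored one even when only the session id differs. The URLs below are illustrative.

source = Source(url='https://example.com/papers.html')  # hypothetical page
li = source.old_link('https://example.com/draft.pdf?PHPSESSID=abc123')
if li:
    print('known link, stored as', li.url)  # may carry a different session id
else:
    print('link not seen before on this page')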
Example #2
def test_query(caplog):
    cur = db.dict_cursor()
    query = "SELECT * FROM sources"
    #cur.execute(query, (url,))
    cur.execute(query)
    sources = cur.fetchall()
    assert True
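As written this is only a smoke test: assert True never fails, so the test passes whenever the query executes without raising. A slightly stronger sketch, assuming only that dict_cursor yields mapping-style rows:

def test_query_returns_dict_rows(caplog):
    cur = db.dict_cursor()
    cur.execute("SELECT * FROM sources")
    sources = cur.fetchall()
    # the table may be empty; we only check the row format
    assert all(isinstance(row, dict) for row in sources)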
Example #3
    def load_from_db(self, url=''):
        url = url or self.url
        if not url:
            raise TypeError("need source url to load Source from db")
        cur = db.dict_cursor()
        query = "SELECT * FROM sources WHERE urlhash = MD5(%s)"
        cur.execute(query, (url,))
        debug(5, cur._last_executed)
        sources = cur.fetchall()
        if sources:
            for k, v in sources[0].items():
                setattr(self, k, v)
        else:
            debug(4, "%s not in sources table", url)
Example #4
    def load_from_db(self, url='', source_id=0):
        url = url or self.url
        source_id = source_id or self.source_id
        if not url or not source_id:
            raise TypeError("need url and source_id to load Link from db")

        cur = db.dict_cursor()
        query = "SELECT * FROM links WHERE urlhash = MD5(%s) AND source_id = %s LIMIT 1"
        cur.execute(query, (url, source_id))
        debug(5, cur._last_executed)
        links = cur.fetchall()
        if links:
            for k, v in links[0].items():
                setattr(self, k, v)
        else:
            debug(4, "link to %s not in database", url)
Example #5
from datetime import datetime, timedelta  # needed for the cutoff below; db and debug are project-internal

def next_source():
    """return the next source from db that's due to be checked"""
    min_age = datetime.now() - timedelta(hours=16)
    min_age = min_age.strftime('%Y-%m-%d %H:%M:%S')
    cur = db.dict_cursor()
    query = ("SELECT * FROM sources WHERE"
             " sourcetype != 'blog'" # ignore rss feeds
             " AND (last_checked IS NULL OR last_checked < %s)"
             " ORDER BY last_checked LIMIT 1")
    cur.execute(query, (min_age,))
    debug(4, cur._last_executed)
    sources = cur.fetchall()
    if sources:
        return Source(**sources[0])
    else:
        debug(1, "all pages recently checked")
        return None
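The natural calling pattern is a polling loop that drains the queue until every non-blog source has been checked within the 16-hour window; process_source below is a hypothetical stand-in for the caller's scraping step.

while True:
    source = next_source()
    if source is None:
        break  # everything was checked within the last 16 hours
    process_source(source)  # hypothetical: scrape the page, update last_checked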
Example #6
def update():
    """
    re-train classifier; the training corpus is taken from the database.
    """
    debug(3, "re-training philosophy classifier")
    cur = db.dict_cursor()
    query = "SELECT cat_id FROM cats WHERE label=%s LIMIT 1"
    cur.execute(query, ('philosophy',))
    cat_id = cur.fetchall()[0]['cat_id']
    query = ("SELECT D.*, M.strength"
             " FROM docs D, docs2cats M"
             " WHERE M.doc_id = D.doc_id AND M.cat_id = %s AND M.is_training = 1")
    cur.execute(query, (cat_id,))
    debug(4, cur._last_executed)
    rows = cur.fetchall()
    if not rows:
        raise Exception('no training documents for philosophy classifier')
    docs = [Doc(**row) for row in rows]
    classes = [row['strength'] for row in rows]
    clf.train(docs, classes)
    clf.save()
Example #7
    def load_from_db(self, doc_id=None, url=None):
        doc_id = doc_id or self.doc_id
        url = url or self.url
        cur = db.dict_cursor()
        if doc_id:
            query = "SELECT * FROM docs WHERE doc_id = %s"
            cur.execute(query, (doc_id,))
        elif url:
            query = "SELECT * FROM docs WHERE urlhash = MD5(%s)"
            cur.execute(query, (url,))
        else:
            raise TypeError("need doc_id or url to load doc from db")
        debug(5, cur._last_executed)
        docs = cur.fetchall()
        if docs:
            for k, v in docs[0].items():
                setattr(self, k, v)
            return True
        else:
            debug(4, "no doc with id %s or url %s in database", doc_id, url)
            return False
Example #8
import re                             # used for the title/author matching below
from difflib import SequenceMatcher  # used for the content similarity check

def get_duplicate(doc):
    """
    returns a document from db that closely resembles doc, or None
    """
    # This is non-trivial because duplicates can have slightly
    # different titles (e.g. with and without <i>), different
    # filesize and wordcount (manuscript vs published version),
    # different authors and abstracts (due to parser mistakes,
    # author name variants, etc.).
    debug(5, "checking for duplicates")
    where = ['doc_id != %s']
    values = [doc.doc_id]
    m = re.search(r'\w+', doc.title) # first title word
    if m:
        where.append('title LIKE %s')
        values.append('%'+m.group()+'%')
    m = re.search(r'(\w+)(?:,|$)', doc.authors) # first author surname
    if m:
        where.append('authors LIKE %s')
        values.append('%'+m.group(1)+'%')
    cur = db.dict_cursor()
    query = "SELECT * FROM docs WHERE " + (' AND '.join(where))
    cur.execute(query, values)
    debug(5, cur._last_executed)
    dupes = cur.fetchall()
    for dupe in dupes:
        debug(5, "candidate: %s, '%s'", dupe['authors'], dupe['title'])
        if abs(doc.numwords - dupe['numwords']) / doc.numwords > 0.2:
            debug(5, "length not close enough")
            continue
        sm = SequenceMatcher(None, doc.content, dupe['content'])
        match_ratio = sm.ratio()
        if match_ratio < 0.1: # sic
            debug(5, "content too different, ratio %s", match_ratio)
            continue
        debug(4, "duplicate: %s, '%s'", dupe['authors'], dupe['title'])
        return Doc(**dupe)
    return None
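The two fuzzy filters are a word-count ratio (reject if the lengths differ by more than 20%) and difflib's SequenceMatcher.ratio(), which returns a similarity score between 0 and 1; the 0.1 content threshold is deliberately permissive, as the "# sic" notes. A self-contained illustration with made-up strings:

from difflib import SequenceMatcher

a = "We argue that knowledge is a kind of mental state."
b = "We argue that knowledge is a kind of <i>mental</i> state."
print(SequenceMatcher(None, a, b).ratio())  # close to 1.0: near-duplicates despite the markup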
Example #9
import argparse
import logging
import sys

import db       # project-internal modules; exact import paths assumed
import debug
import scraper

logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('url', help='(part of) url of source page to scrape')
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()

debug.debuglevel(args.debug_level)

cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url+' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath("//a[contains(@href, '{}')]".format(args.link))
    except Exception as e:
        sys.exit('no link containing '+args.link+' on '+source.url)