Example No. 1
    def __init__(self,
                 data_folder,
                 file_indices,
                 subj_data,
                 normalization="none"):
        """
        Args:
          data_folder : str
            The root folder of the preprocessed data.
          file_indices : Dict[int->int]
            Converts linear indices useful to iterate through the dataset
            into keys for the `subj_data` structure.
          subj_data : dict
            Information about the file location of each sample
          normalization : str
            The type of normalization to use for the data. This can be either
            standard, none or val. val should only be used if this is a
            validation dataset and the statistics are extracted from the
            training set.
        """
        super(EEGDataset2, self).__init__()
        if normalization not in EEGDataset2.all_normalizations:
            raise ValueError(f"Normalization must be in {all_normalizations}.")

        self.normalization = normalization
        self.data_folder = data_folder

        self.xfile_cache = LRUCache(capacity=50)
        self.yfile_cache = LRUCache(capacity=500)
        self.subj_data = subj_data
        self.file_indices = file_indices

        self.init_normalizer()
Example No. 2
    def __init__(self,
                 data_folder,
                 file_indices,
                 subj_data,
                 transformation="none",
                 super_node=False):
        """
        Args:
          data_folder : str
            The root folder of the preprocessed data.
          file_indices : Dict[int->int]
            Converts linear indices useful to iterate through the dataset
            into keys for the `subj_data` structure.
          subj_data : dict
            Information about the file location of each sample
        """
        super(EEGDataset1, self).__init__()
        if transformation not in EEGDataset1.all_transformations:
            raise ValueError(
                f"Transformation must be in {EEGDataset1.all_transformations}.")

        self.super_node = super_node
        self.transformation = transformation
        self.data_folder = data_folder
        self.num_nodes = 90

        self.xfile_cache = LRUCache(capacity=50)
        self.yfile_cache = LRUCache(capacity=500)
        self.subj_data = subj_data
        self.file_indices = file_indices
Example No. 3
 def test_lru_cache_cleaning(self):
     cache = LRUCache(capacity=2, connect=False)
     cache.setRedisConn(self.redis, cache_name='lrucache')
     cache.put('1', '1')
     cache.put('2', '2')
     self.assertEqual(cache.get('1'), '1')
     cache.clearCache()
     self.assertEqual(cache.get('2'), -1)
     self.assertEqual(cache.get('1'), -1)
Example No. 4
    def test_size(self):
        cache = LRUCache(size=5, expires=1000, region='Montreal')
        users = generate_content(n=10)
        for user in users:
            cache.set(user)

        # test max capacity of cache
        self.assertEqual(len(cache), 5)

        # users == 10 but we don't have 10 items in cache
        self.assertNotEqual(len(cache), 10)
Example No. 5
 def test_lru_cache_behavior_with_specific_expiration(self):
     cache = LRUCache(capacity=2, connect=False)
     cache.setRedisConn(self.redis, cache_name='lruwithexp')
     cache.put('2', '2', ttl=1)
     cache.put(
         '1',
         '1',
     )
     self.assertEqual(cache.get('1'), '1')
     time.sleep(1.1)
     self.assertEqual(cache.get('2'), -1)
     self.assertEqual(cache.get('1'), '1')
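The per-entry TTL exercised above is most naturally delegated to the Redis backend itself. A minimal sketch of that pattern with redis-py follows; the key layout (one plain key per entry plus a sorted set for recency, with capacity eviction omitted) is an assumption for illustration, not the tested project's actual schema, and it needs a reachable Redis server.

import time

import redis

r = redis.Redis()

def put(cache_name, key, value, ttl=None):
    # Let Redis expire the entry on its own via the `ex` argument.
    r.set(f"{cache_name}:{key}", value, ex=ttl)
    # Track recency in a sorted set so the least recently used key can be found.
    r.zadd(f"{cache_name}:recency", {key: time.time()})

def get(cache_name, key):
    value = r.get(f"{cache_name}:{key}")
    if value is None:
        return -1  # the tests above treat -1 as a miss
    r.zadd(f"{cache_name}:recency", {key: time.time()})
    return value.decode()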
Example No. 6
 def test_disconnected_cache(self):
     cache = LRUCache(capacity=1, connect=False)
     with self.assertRaises(NameError):
         cache.get('1')
     with self.assertRaises(redis.exceptions.ConnectionError):
         cache.connectCache()
         cache.get('1')
Example No. 7
class LRUCache_test_multithreading_behaviour(unittest.TestCase):
    '''
        Test suite to verify multithreaded behaviour
    '''
    def setUp(self):
        def thread_1(c):
            c.lock.acquire()
            time.sleep(0.01)
            c.lock.release()

        def thread_2(c):
            c.set(1, 10)

        def thread_3(c):
            c.get(1)

        self.c = LRUCache(10, 5)
        self.t1 = Thread(target=thread_1, args=[self.c])
        self.t2 = Thread(target=thread_2, args=[self.c])
        self.t3 = Thread(target=thread_3, args=[self.c])

    def tearDown(self):
        del self.c
        del self.t1
        del self.t2
        del self.t3

    def test_lockSet(self):
        '''
            Lock on LRUCache.set method
        '''
        self.t1.start()
        self.t2.start()
        self.assertEqual(self.t2.is_alive(), True)
        self.assertEqual(self.t1.is_alive(), True)
        self.t2.join()
        self.assertEqual(self.c.head.value, 10)

    def test_lockGet(self):
        '''
            Lock on LRUCache.get method
        '''
        self.c.set(1, 10)
        self.t1.start()
        self.t3.start()
        self.assertEqual(self.t3.is_alive(), True)
        self.assertEqual(self.t1.is_alive(), True)
        self.assertEqual(self.c.head.ttl, 5)
        self.t3.join()
        self.assertEqual(self.c.head.ttl, 4)
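Both tests above only pass if set and get serialize on the same c.lock that thread_1 holds. Below is a minimal sketch of that locking pattern; the class name and the plain-dict storage are placeholders, since the real cache keeps a head/tail linked list with per-node ttl values that the assertions read.

import threading

class LockedCacheSketch:
    # Hypothetical illustration of the lock usage the tests exercise.
    def __init__(self, capacity, ttl):
        self.capacity = capacity
        self.cache_ttl = ttl
        self.lock = threading.Lock()  # the tests acquire this attribute directly
        self._store = {}              # stand-in for the real linked-list structure

    def set(self, key, value, ttl=None):
        with self.lock:               # blocks while another thread holds the lock
            self._store[key] = (value, ttl if ttl is not None else self.cache_ttl)

    def get(self, key):
        with self.lock:
            value, _ = self._store.get(key, (None, 0))
            return value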
Example No. 8
 def test_exceptionLRUCache(self):
     with self.assertRaisesRegex(
             ValueError, "The capacity has to be an integer bigger than 0"):
         LRUCache("1")
     with self.assertRaisesRegex(
             ValueError, "The capacity has to be an integer bigger than 0"):
         LRUCache(0)
     with self.assertRaisesRegex(
             ValueError,
             "The time to live has to be an integer bigger than 0"):
         LRUCache(ttl="1")
     with self.assertRaisesRegex(
             ValueError,
             "The time to live has to be an integer bigger than 0"):
         LRUCache(ttl=0)
Example No. 9
    def setUp(self):
        def thread_1(c):
            c.lock.acquire()
            time.sleep(0.01)
            c.lock.release()

        def thread_2(c):
            c.set(1, 10)

        def thread_3(c):
            c.get(1)

        self.c = LRUCache(10, 5)
        self.t1 = Thread(target=thread_1, args=[self.c])
        self.t2 = Thread(target=thread_2, args=[self.c])
        self.t3 = Thread(target=thread_3, args=[self.c])
Example No. 10
 def setUp(self):
     self.c0 = LRUCache()
     self.c1 = LRUCache(10, 5)
Example No. 11
from math import ceil
from collections import Counter, deque

from autocorrect import spell
from beaker.middleware import SessionMiddleware
from bottle import route, run, request, static_file, redirect, app, template, error

from cache import LRUCache

# dictionaries storing per-user history
global_user_history = dict()
global_user_recent = dict()
global_dict = Counter()
recent_history = deque(maxlen=10)
global_suggest = []
# Initialize custom LRU Cache with a capacity of 10000 search results.
global_search_cache = LRUCache(10000)

base_url = ""

session_opts = {
    'session.type': 'file',
    'session.cookie_expires': 300,
    'session.data_dir': './data',
    'session.auto': True
}
app = SessionMiddleware(app(), session_opts)


@route('/')
def home():
    """home page of web application"""
Example No. 12
from typing import Optional
import uvicorn
from fastapi import FastAPI
from news_api.newsapi_service import NewsAPIService
from reddit_api.reddit_service import RedditAPIService
from cache import LRUCache
from config import Config
from fastapi.middleware.cors import CORSMiddleware


# FAST API app
app = FastAPI()

# Initializing LRUCache object
lru_cache = LRUCache(Config.CAPACITY)

# Allowed Origin
origins = Config.ALLOWED_HOST

# Add CORS Middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["get"],
    allow_headers=["*"],
)


@app.get("/")
def read_root() -> str:
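A hedged sketch (not from the source) of how lru_cache could guard the news service behind an endpoint. The route path, the fetch_headlines method name, the put/get interface, and the None-on-miss behaviour are all assumptions for illustration.

@app.get("/news/{topic}")
def get_news(topic: str):
    cached = lru_cache.get(topic)
    if cached is not None:                               # assumed miss value
        return cached
    headlines = NewsAPIService().fetch_headlines(topic)  # hypothetical method
    lru_cache.put(topic, headlines)
    return headlines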
Example No. 13
class TestLRUCache(unittest.TestCase):
    def setUp(self):
        self.cache1 = LRUCache(100)
        self.cache2 = LRUCache(1)
        self.cache3 = LRUCache(2)
        self.cache4 = LRUCache(1)
        self.cache5 = LRUCache(2)

    def test_init(self):
        self.assertRaises(ValueError, LRUCache, 0)
        self.assertRaises(ValueError, LRUCache, -100)

    def test_get(self):
        self.cache2.set('1', '1')
        self.assertEqual(self.cache2.get('1'), '1')
        self.cache2.set('2', '2')
        self.assertEqual(self.cache2.get('1'), '')
        self.assertEqual(self.cache2.get('2'), '2')

        self.cache3.set('1', '1')
        self.cache3.set('2', '2')
        self.assertEqual(self.cache3.get('1'), '1')
        self.cache3.set('3', '3')
        self.assertEqual(self.cache3.get('1'), '1')
        self.assertEqual(self.cache3.get('2'), '')

    def test_set(self):
        self.cache4.set('1', '1')
        self.assertEqual(self.cache4.get('1'), '1')

    def test_delete(self):
        self.cache5.set('1', '1')
        self.cache5.set('2', '2')
        self.cache5.delete('1')
        self.assertEqual(self.cache5.get('1'), '')
        self.assertEqual(self.cache5.get('2'), '2')
        self.assertRaises(KeyError, self.cache5.delete, '1')
        self.assertRaises(KeyError, self.cache5.delete, '3')

    def test_from_task(self):
        self.cache1.set('Jesse', 'Pinkman')
        self.cache1.set('Walter', 'White')
        self.cache1.set('Jesse', 'James')
        self.assertEqual(self.cache1.get('Jesse'), 'James')
        self.cache1.delete('Walter')
        self.assertEqual(self.cache1.get('Walter'), '')
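The assertions above pin down the expected interface: get returns '' on a miss, delete raises KeyError for unknown keys, setting an existing key refreshes its recency, and the constructor rejects non-positive capacities. A minimal OrderedDict-based sketch that satisfies these tests (the project under test may implement it differently):

from collections import OrderedDict

class LRUCache:
    def __init__(self, capacity):
        if not isinstance(capacity, int) or capacity <= 0:
            raise ValueError("capacity must be a positive integer")
        self.capacity = capacity
        self.cache = OrderedDict()

    def set(self, key, value):
        if key in self.cache:
            self.cache.move_to_end(key)       # refresh recency on update
        self.cache[key] = value
        if len(self.cache) > self.capacity:
            self.cache.popitem(last=False)    # evict the least recently used entry

    def get(self, key):
        if key not in self.cache:
            return ''
        self.cache.move_to_end(key)           # a hit also refreshes recency
        return self.cache[key]

    def delete(self, key):
        del self.cache[key]                   # raises KeyError for unknown keys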
Example No. 14
    def test_expiration(self):
        cache = LRUCache(size=1000, expires=3, region='Montreal')
        content = generate_content(n=10)
        for item in content:
            cache.set(item)

        self.assertEqual(len(cache), 10)
        time.sleep(3)
        cache.remove_expired()
        self.assertTrue(len(cache) == 0)

        # remove expired items
        item1 = content[0]
        cache.set(item1)
        time.sleep(3)
        for item in content[9:]:
            cache.set(item)
        cache.remove_expired()
        self.assertEqual(len(cache), 1)
Example No. 15
class LRUCache_test_logic(unittest.TestCase):
    '''
        Test suite to verify the cache logic
    '''
    def setUp(self):
        self.c0 = LRUCache()
        self.c1 = LRUCache(5, 3)

    def tearDown(self):
        del self.c0
        del self.c1

    def test_insertItem(self):
        '''
            Verify correct setting of the node and the LRU moves
        '''
        # Insert one node and verify all the fields
        self.c0.set(1, 10, 2)
        self.assertEqual(self.c0.cache_nodes, 1)
        self.assertEqual(self.c0.head, self.c0.linkdict[1])
        self.assertEqual(self.c0.head, self.c0.tail)
        self.assertEqual(self.c0.head.left, None)
        self.assertEqual(self.c0.head.right, None)
        self.assertEqual(self.c0.head.key, 1)
        self.assertEqual(self.c0.head.value, 10)
        self.assertEqual(self.c0.head.ttl, 2)

        # Insert first node and verify the ttl default
        self.c1.set(1, 10)
        first_node = self.c1.head
        self.assertEqual(self.c1.head, self.c1.linkdict[1])
        self.assertEqual(self.c1.head.ttl, self.c1.cache_ttl)
        self.assertEqual(self.c1.tail, first_node)

        # Insert second node and verify the ttl default and the correct link
        # list 2 -> 1
        self.c1.set(2, 20, 1)
        second_node = self.c1.head
        self.assertEqual(self.c1.head, self.c1.linkdict[2])
        self.assertEqual(self.c1.head.ttl, 1)
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.head.right, first_node)
        self.assertEqual(self.c1.tail, first_node)

        # Add third node
        # list 3 -> 2 -> 1
        self.c1.set(3, 30)
        self.assertEqual(self.c1.head, self.c1.linkdict[3])
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.head.right, second_node)
        self.assertEqual(second_node.left, self.c1.head)
        self.assertEqual(second_node.right, first_node)
        self.assertEqual(first_node.left, second_node)
        self.assertEqual(first_node.right, None)

        self.assertEqual(self.c1.cache_nodes, 3)

    def test_updateItem(self):
        '''
            Adding an existing key updates its value and moves it to the head of the list
        '''
        self.c1.set(1, 10)
        first_node = self.c1.head
        self.c1.set(2, 20)
        second_node = self.c1.head
        self.c1.set(1, 30, 5)
        # list 1 -> 2
        self.assertEqual(self.c1.head, first_node)
        self.assertEqual(self.c1.head.key, 1)
        self.assertEqual(self.c1.head.value, 30)
        self.assertEqual(self.c1.head.ttl, 5)
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.head.right, second_node)
        self.assertEqual(second_node.left, first_node)
        self.assertEqual(second_node.right, None)

        self.assertEqual(self.c1.cache_nodes, 2)

    def test_getItem(self):
        '''
            Getting an item either moves it to the head of the list and decrements its
            ttl, or evicts it once the ttl is exhausted
        '''
        self.c0.set(1, 10, 2)
        n = self.c0.get(1)
        self.assertEqual(self.c0.head.key, 1)
        self.assertEqual(self.c0.head.ttl, 1)
        self.assertEqual(n, 10)
        n = self.c0.get(2)
        self.assertEqual(n, None)

        self.c1.set(1, 10)
        first_node = self.c1.head
        self.c1.set(2, 20, 5)
        second_node = self.c1.head
        self.c1.set(3, 30, 6)
        third_node = self.c1.head

        # pick from the tail 3 -> 2 -> 1
        n = self.c1.get(1)
        # list 1 -> 3 -> 2
        self.assertEqual(n, 10)
        self.assertEqual(self.c1.head, first_node)
        self.assertEqual(self.c1.tail, second_node)
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.head.right, third_node)
        self.assertEqual(self.c1.tail.right, None)

        # pick from the middle 1 -> 3 -> 2
        n = self.c1.get(3)
        # list 3 -> 1 -> 2
        self.assertEqual(n, 30)
        self.assertEqual(self.c1.head, third_node)
        self.assertEqual(self.c1.head.right, first_node)
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.tail, second_node)
        self.assertEqual(self.c1.tail.right, None)
        self.assertEqual(self.c1.tail.left, first_node)
        self.assertEqual(first_node.right, second_node)
        self.assertEqual(first_node.left, third_node)

    def test_evictionLRU(self):
        '''
            If the cache is full, the tail has to be evicted
        '''
        for i in range(1, 7):
            self.c1.set(i, 10 * i)

        # list 6 -> 5 -> 4 -> 3 -> 2
        self.assertEqual(self.c1.tail.key, 2)
        n = self.c1.get(1)
        self.assertEqual(n, None)

    def test_evictionTTL(self):
        '''
            Once a value has been read ttl times, it is evicted
        '''

        self.c1.set(1, 10, 1)
        self.c1.set(2, 20, 1)
        for i in range(3, 6):
            self.c1.set(i, 10 * i)

        # list 5 -> 4 -> 3 -> 2 -> 1
        # eviction of a node in the middle of the list
        n = self.c1.get(2)
        self.assertEqual(self.c1.head.key, 5)
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.head.right.key, 4)
        self.assertEqual(self.c1.tail.key, 1)
        self.assertEqual(self.c1.tail.right, None)
        n = self.c1.get(2)
        self.assertEqual(n, None)

        # list 5 -> 4 -> 3 -> 1
        # eviction of the tail node
        n = self.c1.get(1)
        self.assertEqual(self.c1.head.key, 5)
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.head.right.key, 4)
        self.assertEqual(self.c1.tail.key, 3)
        self.assertEqual(self.c1.tail.right, None)
        n = self.c1.get(1)
        self.assertEqual(n, None)

        # list 5 -> 4 -> 3
        for i in range(3):
            n = self.c1.get(4)
        self.assertEqual(n, 40)
        n = self.c1.get(4)
        self.assertEqual(n, None)
        self.assertEqual(self.c1.head.key, 5)
        self.assertEqual(self.c1.tail.key, 3)
        self.assertEqual(self.c1.head.right.key, 3)
        self.assertEqual(self.c1.head.left, None)
        self.assertEqual(self.c1.tail.left.key, 5)
        self.assertEqual(self.c1.tail.right, None)

        self.assertEqual(self.c1.cache_nodes, 2)
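These assertions read head, tail, linkdict and cache_nodes on the cache, and key, value, ttl, left and right on every node, which implies a doubly linked recency list threaded through a dict. A sketch of just that data layout (the invariants the tests rely on, not the implementation under test):

class Node:
    # One cache entry; left/right link the recency list (head side / tail side).
    def __init__(self, key, value, ttl):
        self.key = key
        self.value = value
        self.ttl = ttl      # remaining reads before eviction, per the ttl tests
        self.left = None    # neighbour towards the head (more recently used)
        self.right = None   # neighbour towards the tail (least recently used)

# Inside the cache, the tests assume:
#   linkdict    : dict mapping key -> Node for O(1) lookup
#   head        : most recently used node (head.left is None)
#   tail        : least recently used node (tail.right is None), evicted when full
#   cache_nodes : current number of nodes
#   cache_ttl   : default ttl applied when set() is called without one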
Example No. 16
class crawler(object):
    """Represents 'Googlebot'. Populates a database by crawling and indexing
    a subset of the Internet.

    This crawler keeps track of font sizes and makes it simpler to manage word
    ids and document ids."""
    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        self.db_conn = db_conn
        self._liteMode = 1
        self._memory_cap = 50000
        self._doc_id_cache = LRUCache(self._memory_cap)
        self._word_id_cache = LRUCache(self._memory_cap)
        self._inverted_index = {}
        # Map the doc_id of each webpage to the page title and a short description.
        self._document_index = defaultdict(lambda: ["", ""])

        #for page rank
        self._relation = []
        self._curr_relation = []

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter['a'] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter['b'] = self._increase_font_factor(2)
        self._enter['strong'] = self._increase_font_factor(2)
        self._enter['i'] = self._increase_font_factor(1)
        self._enter['em'] = self._increase_font_factor(1)
        self._enter['h1'] = self._increase_font_factor(7)
        self._enter['h2'] = self._increase_font_factor(6)
        self._enter['h3'] = self._increase_font_factor(5)
        self._enter['h4'] = self._increase_font_factor(4)
        self._enter['h5'] = self._increase_font_factor(3)
        self._enter['title'] = visit_title

        # decrease the font size when we exit these tags
        self._exit['b'] = self._increase_font_factor(-2)
        self._exit['strong'] = self._increase_font_factor(-2)
        self._exit['i'] = self._increase_font_factor(-1)
        self._exit['em'] = self._increase_font_factor(-1)
        self._exit['h1'] = self._increase_font_factor(-7)
        self._exit['h2'] = self._increase_font_factor(-6)
        self._exit['h3'] = self._increase_font_factor(-5)
        self._exit['h4'] = self._increase_font_factor(-4)
        self._exit['h5'] = self._increase_font_factor(-3)
        self._exit['title'] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set([
            'meta',
            'script',
            'link',
            'meta',
            'embed',
            'iframe',
            'frame',
            'noscript',
            'object',
            'svg',
            'canvas',
            'applet',
            'frameset',
            'textarea',
            'style',
            'area',
            'map',
            'base',
            'basefont',
            'param',
        ])

        # set of words to ignore
        self._ignored_words = set([
            '',
            'the',
            'of',
            'at',
            'on',
            'in',
            'is',
            'it',
            'a',
            'b',
            'c',
            'd',
            'e',
            'f',
            'g',
            'h',
            'i',
            'j',
            'k',
            'l',
            'm',
            'n',
            'o',
            'p',
            'q',
            'r',
            's',
            't',
            'u',
            'v',
            'w',
            'x',
            'y',
            'z',
            'and',
            'or',
        ])

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, 'r') as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(),
                                                          ""), 0))
        except IOError:
            pass

        # When initializing, by default crawl with a depth of 1.
        self.crawl(depth=1)

    # TODO remove me in real version
    def _mock_insert_document(self, url):
        """A function that pretends to insert a url into a document db table
        and then returns that newly inserted document's id."""
        ret_id = self._mock_next_doc_id
        self._mock_next_doc_id += 1
        return ret_id

    # TODO remove me in real version
    def _mock_insert_word(self, word):
        """A function that pretends to inster a word into the lexicon db table
        and then returns that newly inserted word's id."""
        ret_id = self._mock_next_word_id
        self._mock_next_word_id += 1
        return ret_id

    def word_id(self, word):
        """Get the word id of some specific word."""
        word_id_cached = self._word_id_cache.get(word)
        if word_id_cached != None:
            return word_id_cached
        elif not self._liteMode:
            con = lite.connect(self.db_conn)
            cur = con.cursor()
            cur.execute(
                'CREATE TABLE IF NOT EXISTS lexicon(wordid INTEGER PRIMARY KEY, word text)'
            )
            cur.execute('SELECT * FROM lexicon WHERE word = ?', (word, ))
            result = cur.fetchone()
            con.close()
            if result != () and result != None:
                return result[0]

        # TODO: 1) add the word to the lexicon, if that fails, then the
        #          word is in the lexicon
        #       2) query the lexicon for the id assigned to this word,
        #          store it in the word id cache, and return the id.

        word_id = self._mock_insert_word(word)
        evict = self._word_id_cache.set(word, word_id)
        if evict != None:
            try:
                con = lite.connect(self.db_conn)
                cur = con.cursor()
                cur.execute(
                    'CREATE TABLE IF NOT EXISTS lexicon(wordid INTEGER PRIMARY KEY, word text)'
                )
                cur.execute('INSERT INTO lexicon VALUES (?, ?)',
                            (evict[1], evict[0]))
                con.commit()
                con.close()
            except lite.IntegrityError as e:
                print "can't insert into db...", e
                if "UNIQUE" in str(e):
                    pass
        return word_id

    def document_id(self, url):
        """Get the document id for some url."""
        doc_id_cached = self._doc_id_cache.get(url)
        if doc_id_cached != None:
            return doc_id_cached
        elif not self._liteMode:
            con = lite.connect(self.db_conn)
            cur = con.cursor()
            cur.execute(
                'CREATE TABLE IF NOT EXISTS docIndex(docid INTEGER PRIMARY KEY, url text)'
            )
            cur.execute('SELECT * FROM docIndex WHERE url = ?', (url, ))
            result = cur.fetchone()
            con.close()
            if result != () and result != None:
                return result[0]

        # TODO: just like word id cache, but for documents. if the document
        #       doesn't exist in the db then only insert the url and leave
        #       the rest to their defaults.

        doc_id = self._mock_insert_document(url)
        evict = self._doc_id_cache.set(url, doc_id)
        if evict != None:
            try:
                con = lite.connect(self.db_conn)
                cur = con.cursor()
                cur.execute(
                    'CREATE TABLE IF NOT EXISTS docIndex(docid INTEGER PRIMARY KEY, url text)'
                )
                cur.execute('INSERT INTO docIndex VALUES (?, ?)',
                            (evict[1], evict[0]))
                con.commit()
                con.close()
            except lite.IntegrityError as e:
                print "can't insert into db..."
                if "UNIQUE" in str(e):
                    pass
        return doc_id

    def _fix_url(self, curr_url, rel):
        """Given a url and either something relative to that url or another url,
        get a properly parsed url."""

        rel_l = rel.lower()
        if rel_l.startswith("http://") or rel_l.startswith("https://"):
            curr_url, rel = rel, ""

        # compute the new url based on import
        curr_url = urlparse.urldefrag(curr_url)[0]
        parsed_url = urlparse.urlparse(curr_url)
        return urlparse.urljoin(parsed_url.geturl(), rel)

    def add_link(self, from_doc_id, to_doc_id):
        """Add a link into the database, or increase the number of links between
        two pages in the database."""
        # TODO

    def _visit_title(self, elem):
        """Called when visiting the <title> tag."""
        title_text = self._text_of(elem).strip()
        print "document title=" + repr(title_text)
        self._document_index[self._curr_doc_id][0] = title_text
        # TODO update document title for document id self._curr_doc_id

    def _visit_a(self, elem):
        """Called when visiting <a> tags."""

        dest_url = self._fix_url(self._curr_url, attr(elem, "href"))
        # print "href="+repr(dest_url), \
        #      "title="+repr(attr(elem,"title")), \
        #      "alt="+repr(attr(elem,"alt")), \
        #      "text="+repr(self._text_of(elem))

        # add the just found URL to the url queue
        self._url_queue.append((dest_url, self._curr_depth))
        self._curr_relation.append(dest_url)

        # add a link entry into the database from the current document to the
        # other document
        self.add_link(self._curr_doc_id, self.document_id(dest_url))

        # TODO add title/alt/text to index for destination url

    def _add_words_to_document(self):
        # TODO: knowing self._curr_doc_id and the list of all words and their
        #       font sizes (in self._curr_words), add all the words into the
        #       database for this document
        print "    num words=" + str(len(self._curr_words))

    def _increase_font_factor(self, factor):
        """Increade/decrease the current font size."""
        def increase_it(elem):
            self._font_size += factor

        return increase_it

    def _visit_ignore(self, elem):
        """Ignore visiting this type of tag"""
        pass

    def _add_text(self, elem):
        """Add some text to the document. This records word ids and word font sizes
        into the self._curr_words list for later processing."""
        words = WORD_SEPARATORS.split(elem.string.lower())
        for word in words:
            word = word.strip()
            if word in self._ignored_words:
                continue
            self._curr_words.append((self.word_id(word), self._font_size))

    def _text_of(self, elem):
        """Get the text inside some element without any tags."""
        if isinstance(elem, Tag):
            text = []
            for sub_elem in elem:
                text.append(self._text_of(sub_elem))

            return " ".join(text)
        else:
            return elem.string

    def _index_document(self, soup):
        """Traverse the document in depth-first order and call functions when entering
        and leaving tags. When we come across some text, add it into the index. This
        handles ignoring tags that we have no business looking at."""
        class DummyTag(object):
            next = False
            name = ''

        class NextTag(object):
            def __init__(self, obj):
                self.next = obj

        tag = soup.html
        stack = [DummyTag(), soup.html]
        text_line = 0

        while tag and tag.next:
            tag = tag.next

            # html tag
            if isinstance(tag, Tag):

                if tag.parent != stack[-1]:
                    self._exit[stack[-1].name.lower()](stack[-1])
                    stack.pop()

                tag_name = tag.name.lower()

                # ignore this tag and everything in it
                if tag_name in self._ignored_tags:
                    if tag.nextSibling:
                        tag = NextTag(tag.nextSibling)
                    else:
                        self._exit[stack[-1].name.lower()](stack[-1])
                        stack.pop()
                        tag = NextTag(tag.parent.nextSibling)

                    continue

                # enter the tag
                self._enter[tag_name](tag)
                stack.append(tag)

            # text (text, cdata, comments, etc.)
            else:
                self._add_text(tag)
                text = tag.string.lower()
                # Use first three non-empty lines in a page as page description.
                if text_line < 3 and text.strip():
                    self._document_index[self._curr_doc_id][1] += text
                    text_line += 1

    def _populate_inverted_index(self):
        """Populate the inverted index.

         For each word_id encountered in the current document, add the current
         document ID to the set of documents that contain the word.
        """
        if self._liteMode:
            #print self._curr_words
            for word, _ in self._curr_words:
                if word not in self._inverted_index:
                    self._inverted_index[word] = set()
                self._inverted_index[word].add(self._curr_doc_id)
        else:
            for word, _ in self._curr_words:
                con = lite.connect(self.db_conn)
                cur = con.cursor()
                cur.execute(
                    'CREATE TABLE IF NOT EXISTS invertedIndex(wordid INTEGER, docid INTEGER)'
                )
                cur.execute('INSERT INTO invertedIndex VALUES (?, ?)',
                            (word, self._curr_doc_id))
            con.commit()
            con.close()

    def crawl(self, depth=2, timeout=3):
        """Crawl the web!"""
        seen = set()

        while len(self._url_queue):

            url, depth_ = self._url_queue.pop()

            # skip this url; it's too deep
            if depth_ > depth:
                continue

            doc_id = self.document_id(url)

            # we've already seen this document
            if doc_id in seen:
                continue

            seen.add(doc_id)  # mark this document as having been visited

            socket = None
            try:
                socket = urllib2.urlopen(url, timeout=timeout)
                soup = BeautifulSoup(socket.read())

                self._curr_depth = depth_ + 1
                self._curr_url = url
                self._curr_doc_id = doc_id
                self._font_size = 0
                self._curr_words = []
                self._index_document(soup)
                self._add_words_to_document()
                self._populate_inverted_index()

                #build self._relation
                for item in self._curr_relation:
                    self._relation.append((self._curr_url, item))
                self._curr_relation = []

            except Exception as e:
                print e
                pass
            finally:
                if socket:
                    socket.close()

        self.insertdatabase()
        self._word_id_cache = {}
        self._doc_id_cache = {}
        self.get_page_rank()

    def get_doc_id_cache(self):
        con = lite.connect(self.db_conn)
        cur = con.cursor()
        cur.execute('SELECT * FROM docIndex')
        result = cur.fetchall()
        dic = {}
        for item in result:
            dic[item[1]] = item[0]
        con.close()
        return dic

    def get_inverted_doc_id_cache(self):
        con = lite.connect(self.db_conn)
        cur = con.cursor()
        cur.execute('SELECT * FROM docIndex')
        result = cur.fetchall()
        dic = {}
        for item in result:
            dic[item[0]] = item[1]
        con.close()
        return dic

    def get_inverted_word_id_cache(self):
        con = lite.connect(self.db_conn)
        cur = con.cursor()
        cur.execute('SELECT * FROM lexicon')
        result = cur.fetchall()
        dic = {}
        for item in result:
            dic[item[0]] = item[1]
        con.close()
        return dic

    def get_inverted_index(self):
        """Retrieves an inverted index for crawled pages.

        Returns:
            A dict mapping each encountered word to the set of documents where
            they are found, in the form {word_id: set(doc_id1, doc_id2, ...)}.
        """
        if self._liteMode:
            return self._inverted_index
        else:
            con = lite.connect(self.db_conn)
            cur = con.cursor()
            cur.execute('SELECT * FROM invertedIndex')
            result = cur.fetchall()
            dic = {}
            for item in result:
                if item[0] not in dic:
                    dic[item[0]] = set()
                dic[item[0]].add(item[1])
            #print dic
            con.close()
            return dic

    def get_resolved_inverted_index(self):
        """Retrieves an inverted index for crawled pages with word IDs and doc
        IDs resolved to words and URLs.

        Returns:
            A dict mapping each encountered word to the set of documents where
            they are found, in the form {word: set(url1, url2, ...)}.
        """

        #inverted_index = self._inverted_index
        inverted_index = self.get_inverted_index()
        inverted_doc_id = self.get_inverted_doc_id_cache()
        inverted_word_id = self.get_inverted_word_id_cache()
        resolved_inverted_index = {}
        for word_id, doc_id_set in inverted_index.items():
            word = inverted_word_id[word_id]
            url_set = set()
            for doc_id in doc_id_set:
                url_set.add(inverted_doc_id[doc_id])
            resolved_inverted_index[word] = url_set
        return resolved_inverted_index

    def get_page_rank(self):
        # Compute the rank score of each website, write the scores into the pageRank table and return them.
        relation = []
        doc_id_cache = self.get_doc_id_cache()
        # self._relation is a list of (from_url, to_url) tuples generated by the crawler
        for item in self._relation:
            # convert the urls to doc_ids to match the format of page_rank function
            fromid = doc_id_cache[item[0]]
            toid = doc_id_cache[item[1]]
            relation.append((fromid, toid))
        # call the page_rank function to calculate the scores; it returns a defaultdict
        pr = self.page_rank(relation)

        # insert the rankscore to pageRank table in database
        con = lite.connect(self.db_conn)
        cur = con.cursor()
        cur.execute(
            'CREATE TABLE IF NOT EXISTS pageRank(docid INTEGER PRIMARY KEY, score real)'
        )
        for item in pr:
            score = pr[item]
            cur.execute('INSERT INTO pageRank VALUES (?, ?)', (item, score))
        cur.execute('SELECT * FROM pageRank')
        #print "pageRank Table:"
        #print "[docid,   score   ]"
        #print each row of the pageRank table in the database.
        #for row in cur:
        #print row
        con.commit()
        con.close()
        return pr

    def page_rank(self, links, num_iterations=20, initial_pr=1.0):
        from collections import defaultdict
        import numpy as np

        page_rank = defaultdict(lambda: float(initial_pr))
        num_outgoing_links = defaultdict(float)
        incoming_link_sets = defaultdict(set)
        incoming_links = defaultdict(lambda: np.array([]))
        damping_factor = 0.85

        # collect the number of outbound links and the set of all incoming documents
        # for every document
        for (from_id, to_id) in links:
            num_outgoing_links[int(from_id)] += 1.0
            incoming_link_sets[to_id].add(int(from_id))

        # convert each set of incoming links into a numpy array
        for doc_id in incoming_link_sets:
            incoming_links[doc_id] = np.array(
                [from_doc_id for from_doc_id in incoming_link_sets[doc_id]])

        num_documents = float(len(num_outgoing_links))
        lead = (1.0 - damping_factor) / num_documents
        partial_PR = np.vectorize(
            lambda doc_id: page_rank[doc_id] / num_outgoing_links[doc_id])

        for _ in xrange(num_iterations):
            for doc_id in num_outgoing_links:
                tail = 0.0
                if len(incoming_links[doc_id]):
                    tail = damping_factor * partial_PR(
                        incoming_links[doc_id]).sum()
                page_rank[doc_id] = lead + tail

        return page_rank

    def insertdatabase(self):
        # insert lexicon, docindex and inverted index into the database
        con = lite.connect(self.db_conn)
        cur = con.cursor()
        cur.execute(
            'CREATE TABLE IF NOT EXISTS lexicon(wordid INTEGER PRIMARY KEY, word text UNIQUE)'
        )
        for item in self._word_id_cache.map:
            word_id = self._word_id_cache.map[item][0]
            try:
                cur.execute('INSERT INTO lexicon VALUES (?, ?)',
                            (word_id, item))
            except lite.IntegrityError as e:
                print "can't insert into db...", e
                if "UNIQUE" in str(e):
                    pass
        cur.execute(
            'CREATE TABLE IF NOT EXISTS docIndex(docid INTEGER PRIMARY KEY, url text UNIQUE)'
        )
        for item in self._doc_id_cache.map:
            doc_id = self._doc_id_cache.map[item][0]
            try:
                cur.execute('INSERT INTO docIndex VALUES (?, ?)',
                            (doc_id, item))
            except lite.IntegrityError as e:
                print "can't insert into db...", e
                if "UNIQUE" in str(e):
                    pass

        con.commit()
        con.close()
        return None
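Two details of the class above are worth spelling out. First, word_id and document_id use the LRUCache as a write-back cache: LRUCache.set apparently returns the evicted (key, value) pair when capacity is exceeded, and only then is the entry flushed to the lexicon or docIndex table. Second, page_rank is the standard damped PageRank iteration; with damping factor β = 0.85 and N documents with outgoing links it repeatedly applies

    PR(d) \leftarrow \frac{1-\beta}{N} + \beta \sum_{s \in \mathrm{In}(d)} \frac{PR(s)}{|\mathrm{Out}(s)|}

where the first term is `lead` and the sum is `tail` in the loop.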
Example No. 17
class EEGDataset1(Dataset):
    """PyTorch dataloader for the imaginary coherence dataloader (dataset 1).
    """

    all_transformations = ["one", "std"]

    def __init__(self,
                 data_folder,
                 file_indices,
                 subj_data,
                 transformation="none",
                 super_node=False):
        """
        Args:
          data_folder : str
            The root folder of the preprocessed data.
          file_indices : Dict[int->int]
            Converts linear indices useful to iterate through the dataset
            into keys for the `subj_data` structure.
          subj_data : dict
            Information about the file location of each sample
        """
        super(EEGDataset1, self).__init__()
        if transformation not in EEGDataset1.all_transformations:
            raise ValueError(
                f"Transformation must be in {EEGDataset1.all_transformations}.")

        self.super_node = super_node
        self.transformation = transformation
        self.data_folder = data_folder
        self.num_nodes = 90

        self.xfile_cache = LRUCache(capacity=50)
        self.yfile_cache = LRUCache(capacity=500)
        self.subj_data = subj_data
        self.file_indices = file_indices

    def get_xy_files(self, idx):
        idx = self.file_indices[idx]
        sdata = self.subj_data[idx]
        x_file = os.path.join(self.data_folder, "X", sdata["file"])
        y_file = os.path.join(self.data_folder, "Y", sdata["file"])
        iif = sdata["index_in_file"]

        return x_file, y_file, iif

    def __getitem__(self, idx):
        x_file, y_file, iif = self.get_xy_files(idx)

        X = self.xfile_cache.load(x_file, iif)  # [4095, 50]
        X = self.transform(X)  # [num_freq, 90, 90]
        Y = self.yfile_cache.load(y_file, iif)

        sample = {
            "X": torch.tensor(X, dtype=torch.float32),
            "Y": torch.tensor(Y, dtype=torch.long),
        }
        return sample

    def transform(self, X):
        """
        Args:
         X : numpy array [4095, 50]
        Returns:
         X_transformed : numpy array [num_freq, 90, 90]
        """
        if self.transformation == "std":
            X_delta = np.mean(X[:, 0:4], axis=-1)  # 1 to <4 Hz
            X_theta = np.mean(X[:, 4:8], axis=-1)  # 4 to <8 Hz
            X_alpha = np.mean(X[:, 8:13], axis=-1)  # 8 - <13 Hz
            X_beta = np.mean(X[:, 13:30], axis=-1)  # 13 - <30 Hz
            X_gamma = np.mean(X[:, 30:], axis=-1)  # >=30 Hz
            X_aggregated = np.stack(
                (X_delta, X_theta, X_alpha, X_beta, X_gamma), axis=1)
        elif self.transformation == "one":
            X_aggregated = np.expand_dims(np.mean(X, axis=-1), 1)

        As = []
        for band in range(X_aggregated.shape[1]):
            A = self.adj_from_tril(X_aggregated[:, band])  # 90 x 90
            As.append(A)
        A = np.stack(As, axis=0).astype(np.float32)  # num_freq x 90 x 90
        return A

    def __len__(self):
        return len(self.file_indices)

    @property
    def num_bands(self):
        if self.transformation == "std":
            return 5
        elif self.transformation == "one":
            return 1

    def adj_from_tril(self, one_coh_arr):
        """ builds the A hat matrix of the paper for one sample.
        https://github.com/brainstorm-tools/brainstorm3/blob/master/toolbox/process/functions/process_compress_sym.m shows that
        the flat matrix contains the lower triangular values of the initial symmetric matrix.

        Args:
          one_coh_arr : array [num_nodes*(num_nodes+1)/2]
            Flattened lower-triangular coherence values; `self.num_nodes` and
            `self.super_node` control how the matrix is rebuilt.

        Returns:
          A : array [num_nodes, num_nodes]
        """
        # First construct weighted adjacency matrix
        A = np.zeros((self.num_nodes, self.num_nodes))
        index = np.tril_indices(self.num_nodes)
        A[index] = one_coh_arr
        A = (A + A.T)
        if self.super_node:
            A = np.concatenate((A, np.ones((self.num_nodes, 1))),
                               axis=1)  # adding the super node
            A = np.concatenate((A, np.ones((1, self.num_nodes + 1))), axis=0)
        # A tilde from the paper
        di = np.diag_indices(self.num_nodes)
        A[di] = A[di] / 2
        A_tilde = A + np.eye(A.shape[0])
        # D tilde power -0.5
        D_tilde_inv = np.diag(np.power(np.sum(A_tilde, axis=0), -0.5))
        # Finally build A_hat
        A_hat = np.matmul(D_tilde_inv, np.matmul(A_tilde, D_tilde_inv))
        return A_hat
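adj_from_tril above unpacks a flattened lower-triangular coherence vector (for 90 nodes that is 90·91/2 = 4095 entries, which matches the [4095, 50] input shape) into a symmetric matrix, halves the diagonal that A + A^T double-counted, and then applies the usual GCN renormalization:

    \tilde{A} = A + I, \qquad \tilde{D}_{ii} = \sum_j \tilde{A}_{ij}, \qquad \hat{A} = \tilde{D}^{-1/2}\,\tilde{A}\,\tilde{D}^{-1/2}

With super_node=True, a row and a column of ones are appended before this normalization, connecting every region to one extra aggregate node.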
Example No. 18
 def test_lru_cache_behavior_specific_expiration_with_two_instances(self):
     cache_montreal = LRUCache(capacity=2, connect=False)
     cache_montreal.setRedisConn(self.redis, cache_name='lru')
     cache_bogota = LRUCache(capacity=2, connect=False)
     cache_bogota.setRedisConn(self.redis, cache_name='lru')
     cache_montreal.put('1', '1')
     cache_montreal.put('2', '2', ttl=1)
     self.assertEqual(cache_bogota.get('1'), '1')
     time.sleep(1.1)
     self.assertEqual(cache_bogota.get('2'), -1)
     self.assertEqual(cache_bogota.get('1'), '1')
Example No. 19
class EEGDataset2(Dataset):
    """PyTorch dataloader for the temporal sequence dataset (dataset 2).
    This dataset has 435 identified ROIs, each containing the mean activation
    of sources in the regions at every time-point. The data is organized by
    sleep activity. Each file contains activations for the ROIs at 1500
    time points (at 1 Hz?).

    Note:
      The dataset has very small values (i.e. 1e-10 range). This may cause
      precision errors when using single-precision floating point numbers.
      This class offers two normalization options:
       - standardizing each ROI to 0-mean, unit variance (requires preprocessing
         the whole dataset to extract global statistics)
       - scaling by a large value (NORM_CONSTANT).
    """

    all_normalizations = [
        "standard",  # Standardize each ROI
        "none",  # Multiply all values by NORM_CONSTANT
        "val",  # Indicates that this is a validation loader so normalization is loaded from the tr loader
    ]

    NORM_CONSTANT = 1.0e10

    def __init__(self,
                 data_folder,
                 file_indices,
                 subj_data,
                 normalization="none"):
        """
        Args:
          data_folder : str
            The root folder of the preprocessed data.
          file_indices : Dict[int->int]
            Converts linear indices useful to iterate through the dataset
            into keys for the `subj_data` structure.
          subj_data : dict
            Information about the file location of each sample
          normalization : str
            The type of normalization to use for the data. This can be either
            standard, none or val. val should only be used if this is a
            validation dataset and the statistics are extracted from the
            training set.
        """
        super(EEGDataset2, self).__init__()
        if normalization not in EEGDataset2.all_normalizations:
            raise ValueError(f"Normalization must be in {all_normalizations}.")

        self.normalization = normalization
        self.data_folder = data_folder

        self.xfile_cache = LRUCache(capacity=50)
        self.yfile_cache = LRUCache(capacity=500)
        self.subj_data = subj_data
        self.file_indices = file_indices

        self.init_normalizer()

    def get_xy_files(self, idx):
        idx = self.file_indices[idx]
        sdata = self.subj_data[idx]
        x_file = os.path.join(self.data_folder, "X", sdata["file"])
        y_file = os.path.join(self.data_folder, "Y", sdata["file"])
        iif = sdata["index_in_file"]

        return x_file, y_file, iif

    def __getitem__(self, idx):
        x_file, y_file, iif = self.get_xy_files(idx)

        X = self.xfile_cache.load(x_file, iif)  # [num_nodes, time_steps]
        X = self.normalize(X)
        Y = self.yfile_cache.load(y_file, iif)

        sample = {
            "X": torch.tensor(X, dtype=torch.float32),
            "Y": torch.tensor(Y, dtype=torch.long),
        }
        return sample

    def __len__(self):
        return len(self.file_indices)

    def init_normalizer(self):
        if self.normalization == "val":
            return

        print(
            f"{time_str()} Initializing normalization ({self.normalization}) statistics."
        )
        if self.normalization == "none":
            self.scaler = None
            return

        self.scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
        # Iterate all samples to compute statistics.
        # TODO: This can be optimized to feed the scalers all samples read from a file
        #       but care must be taken to actually only feed it samples whose id is in
        #       the allowed ids.
        for i in range(len(self)):
            x_file, y_file, iif = self.get_xy_files(i)
            X = self.xfile_cache.load(x_file, iif)
            self.scaler.partial_fit(X)

    def normalize(self, data):
        """
        Args:
         - data : array [423, time_steps]

        Returns:
         - norm_data : array [423, time_steps]
        """
        if self.normalization == "val":
            raise ValueError(
                "Normalization cannot be `val`, must be set to a concrete value."
            )

        if self.normalization == "none":
            data = data * EEGDataset2.NORM_CONSTANT
        else:
            data = self.scaler.transform(data)

        return data.astype(np.float32)
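A hedged usage sketch (not from the source) of the train/validation pairing the normalization docstring describes. The folder path and index structures are placeholders, and copying the fitted scaler onto the validation set is an assumption about how the "val" option is meant to be wired up.

from torch.utils.data import DataLoader

train_set = EEGDataset2("preprocessed/", train_file_indices, subj_data,
                        normalization="standard")
val_set = EEGDataset2("preprocessed/", val_file_indices, subj_data,
                      normalization="val")        # skips computing statistics
val_set.scaler = train_set.scaler                 # reuse the training statistics
val_set.normalization = train_set.normalization   # so normalize() accepts the data

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)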
""" Global values."""

from cache import LRUCache
from typing import List

# Declaration of the trajectory list
TRAJECTORIES: List = []

# Regular cache
CACHE = LRUCache(1000)

# Tail cache
TAIL_CACHE = LRUCache(500)

HANDLERS: List = []

COUNT_EXCHANGED = 0
COUNT_MERGES = 0
COUNT_CREATED_HANDLERS = 0
Example No. 21
 def setUp(self):
     self.cache1 = LRUCache(100)
     self.cache2 = LRUCache(1)
     self.cache3 = LRUCache(2)
     self.cache4 = LRUCache(1)
     self.cache5 = LRUCache(2)
Example No. 22
from cache import LRUCache

cache = LRUCache(100)
cache.set('Jesse', 'Pinkman')
cache.set('Walter', 'White')
print(f'Cache contains: {cache.cache}')

cache.set('Jesse', 'James')
print(f'Cache contains: {cache.cache}')

print(f'Get method: {cache.get("Jesse")}')

cache.delete('Walter')
print(f'Cache contains: {cache.cache}')

print(f'Get method: {cache.get("Walter")}')


cache = LRUCache(2)
cache.set('Jesse', 'Pinkman')
print(f'Cache contains: {cache.cache}')
cache.set('Jesse', 's;khgdf')
print(f'Cache contains: {cache.cache}')
cache.set('Walter', 'White')
print(f'Cache contains: {cache.cache}')
cache.set('23', 'unknown')
# print(f'Cache contains: {cache.cache}')
Example No. 23
from cache import LRUCache

cache = LRUCache(1)
cache.set('Jesse', 'Pinkman')
cache.set('Walter', 'White')
cache.set('Jesse', 'James')
print(cache.get('Jesse'))
cache.delete('Walter')
print(cache.get('Walter'))
Example No. 24
 def setUp(self):
     self.c0 = LRUCache()
     self.c1 = LRUCache(5, 3)
Example No. 25
    # tree.inodes['/']['home'].update({'3.jpg': d})

    # print tree

    # serialization
    #f = open('tree', 'wb')
    #f.write(tree.marshal())
    #f.close()

    # deserialization
    # f = open('tree')
    # tree_str = f.read()
    # print tree.unmarshal(tree_str)

    #upload_main_inode(tree.marshal())
    #tree_str = download_from_vk(tree=True)

    #print Tree.unmarshal(tree_str)

    cache = LRUCache(capacity=66)
    cache.set(a.id, (a.size, '/tmp/2.jpg'))
    cache.set(b.id, b)
    cache.set(c.id, c)
    cache.set(c1.id, c1)
    # hit the cache. c should be popped.
    cache.get(a.id)
    cache.get(b.id)
    cache.get(c1.id)
    cache.set(d.id, d)

    cache.get(a.id)
Example No. 26
    def test_lru(self):
        cache = LRUCache(10)
        cache.set("1", 1)
        cache.set("2", 2)
        cache.set("3", 3)
        self.assertEqual(cache.get("2"), 2)

        cache.set("4", 4)
        cache.set("5", 5)
        cache.set("6", 6)
        cache.set("7", 7)
        cache.set("8", 8)
        cache.set("9", 9)
        self.assertEqual(cache.get("8"), 8)

        cache.set("10", 10)
        cache.set("11", 11)
        self.assertEqual(cache.get("1"), None)
        self.assertEqual(cache.get("3"), 3)
Example No. 27
            durs.append(r.duration())
        durs.sort()
        print sum(durs), min(durs), max(durs)
        ress = []
        for i in [5, 10, 20, 40, 60, 80, 90, 95, 99]:
            idx = int(i / 100.0 * len(durs))
            #print idx
            ress.append((i, durs[idx]))
        print "---------percentile-------"
        print ress
        print "-----end----\n\n\n"
        return ress


if __name__ == '__main__':
    cache = LRUCache(2000)
    gen = zipfian_generator(0, 10000, zipfianconstant=0.9)

    srate = 100000.0
    ratio21 = 1000.0
    cache_hit_ratio = 0.65
    arate_max = srate / ratio21 / (1 - cache_hit_ratio)
    arate = arate_max * 0.7

    successor = simpleQueue(srate / ratio21)

    no = 4

    cache.clear()
    successor.clear()
    q1 = Que(cache=cache, generator=gen, successor=successor)
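The arrival-rate arithmetic in the block above: the successor queue is provisioned at srate / ratio21 = 100000 / 1000 = 100 requests/s, and only cache misses (a 1 - 0.65 = 0.35 fraction of the traffic) reach it, so the largest sustainable front-end rate is

    arate_max = (srate / ratio21) / (1 - cache_hit_ratio) = 100 / 0.35 ≈ 285.7 requests/s

and the run drives the system at 70% of that, arate ≈ 200 requests/s.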
Example No. 28
    def __init__(self, db_conn, url_file):
        """Initialize the crawler with a connection to the database to populate
        and with the file containing the list of seed URLs to begin indexing."""
        self._url_queue = []
        self.db_conn = db_conn
        self._liteMode = 1
        self._memory_cap = 50000
        self._doc_id_cache = LRUCache(self._memory_cap)
        self._word_id_cache = LRUCache(self._memory_cap)
        self._inverted_index = {}
        # Map the doc_id of each webpage to the page title and a short description.
        self._document_index = defaultdict(lambda: ["", ""])

        #for page rank
        self._relation = []
        self._curr_relation = []

        # functions to call when entering and exiting specific tags
        self._enter = defaultdict(lambda *a, **ka: self._visit_ignore)
        self._exit = defaultdict(lambda *a, **ka: self._visit_ignore)

        # add a link to our graph, and indexing info to the related page
        self._enter['a'] = self._visit_a

        # record the currently indexed document's title and increase
        # the font size
        def visit_title(*args, **kargs):
            self._visit_title(*args, **kargs)
            self._increase_font_factor(7)(*args, **kargs)

        # increase the font size when we enter these tags
        self._enter['b'] = self._increase_font_factor(2)
        self._enter['strong'] = self._increase_font_factor(2)
        self._enter['i'] = self._increase_font_factor(1)
        self._enter['em'] = self._increase_font_factor(1)
        self._enter['h1'] = self._increase_font_factor(7)
        self._enter['h2'] = self._increase_font_factor(6)
        self._enter['h3'] = self._increase_font_factor(5)
        self._enter['h4'] = self._increase_font_factor(4)
        self._enter['h5'] = self._increase_font_factor(3)
        self._enter['title'] = visit_title

        # decrease the font size when we exit these tags
        self._exit['b'] = self._increase_font_factor(-2)
        self._exit['strong'] = self._increase_font_factor(-2)
        self._exit['i'] = self._increase_font_factor(-1)
        self._exit['em'] = self._increase_font_factor(-1)
        self._exit['h1'] = self._increase_font_factor(-7)
        self._exit['h2'] = self._increase_font_factor(-6)
        self._exit['h3'] = self._increase_font_factor(-5)
        self._exit['h4'] = self._increase_font_factor(-4)
        self._exit['h5'] = self._increase_font_factor(-3)
        self._exit['title'] = self._increase_font_factor(-7)

        # never go in and parse these tags
        self._ignored_tags = set([
            'meta',
            'script',
            'link',
            'meta',
            'embed',
            'iframe',
            'frame',
            'noscript',
            'object',
            'svg',
            'canvas',
            'applet',
            'frameset',
            'textarea',
            'style',
            'area',
            'map',
            'base',
            'basefont',
            'param',
        ])

        # set of words to ignore
        self._ignored_words = set([
            '',
            'the',
            'of',
            'at',
            'on',
            'in',
            'is',
            'it',
            'a',
            'b',
            'c',
            'd',
            'e',
            'f',
            'g',
            'h',
            'i',
            'j',
            'k',
            'l',
            'm',
            'n',
            'o',
            'p',
            'q',
            'r',
            's',
            't',
            'u',
            'v',
            'w',
            'x',
            'y',
            'z',
            'and',
            'or',
        ])

        # TODO remove me in real version
        self._mock_next_doc_id = 1
        self._mock_next_word_id = 1

        # keep track of some info about the page we are currently parsing
        self._curr_depth = 0
        self._curr_url = ""
        self._curr_doc_id = 0
        self._font_size = 0
        self._curr_words = None

        # get all urls into the queue
        try:
            with open(url_file, 'r') as f:
                for line in f:
                    self._url_queue.append((self._fix_url(line.strip(),
                                                          ""), 0))
        except IOError:
            pass

        # When initializing, by default crawl with a depth of 1.
        self.crawl(depth=1)
Example No. 29
from flask import Flask, request
from flask import jsonify
from flask import Response
import numpy as np
import sqlite3
from sklearn.metrics.pairwise import cosine_similarity
import heapq
from cache import LRUCache

app = Flask(__name__)
db_file = '../db/usersim.sqlite'
cache = LRUCache(capacity=50)
users_cache_size = 100
sql_batch_size = 1000


def send_error(message, status):
    """
    To send an error response to client
    """
    data = {}
    data['error_text'] = message
    data['users'] = []
    response = jsonify(data)
    response.status_code = status
    return response


@app.route('/similarusers/<int:user_handle>')
def get_similar_users(user_handle):
    """
Example No. 30
    def test_default(self):
        cache = LRUCache(2)
        cache.put(1, 1)
        cache.put(2, 2)
        assert cache.get(1) == 1
        # This operation will invalidate key 2
        cache.put(3, 3)
        assert cache.get(2) == -1

        # This operation will invalidate key 1
        cache.put(4, 4)
        assert cache.get(1) == -1
        assert cache.get(3) == 3
        assert cache.get(4) == 4
Example No. 31
            self.st_mtime = inode.m_time
            self.st_ctime = self.st_mtime
        else:
            self.st_mode = stat.S_IFDIR | 0755
            self.st_ino = 0
            self.st_dev = 0
            self.st_nlink = 1
            self.st_uid = os.getuid()
            self.st_gid = os.getgid()
            self.st_size = BLOCK_SIZE
            self.st_atime = int(time.time())
            self.st_mtime = self.st_atime
            self.st_ctime = self.st_atime


cache = LRUCache(capacity=CACHE_CAPACITY)


class SNfs(Fuse):
    def __init__(self, *args, **kw):
        Fuse.__init__(self, *args, **kw)
        self.root = '/'
        try:
            os.mkdir(CACHE_DIR)  # create cache dir
        except OSError:  # path already exists
            pass

        tree_str = download_from_vk(tree=True)
        self.tree = Tree.unmarshal(tree_str)
        # self.tree = tree