예제 #1
0
def extract_from_db_to_file_system(min_page_len=500):
    """Dump pages from the database into numbered Markdown files.

    Pages are fetched in batches of 1000 via
    ``Page.get_pages(offset, limit, min_page_len)`` until a batch comes back
    empty. Each page is written to ``<config.data_base_path>/temp/<n>.md``
    (``n`` counts up from 1): the title on the first line, followed by the
    page content.

    :param min_page_len: forwarded unchanged to ``Page.get_pages``
        (presumably a minimum content length -- confirm against ``Page``).
    :return: None
    """
    folder = "%s/temp" % config.data_base_path
    # isdir() is already False for a missing path, so one check is enough.
    if not os.path.isdir(folder):
        os.makedirs(folder)

    p = Page()
    batch_size = 1000
    offset = 0
    counter = 1  # doubles as the output file name

    batch_result = p.get_pages(offset, batch_size, min_page_len)
    while len(batch_result) > 0:
        for r in batch_result:
            file_path = "%s/%s.md" % (folder, counter)
            # The context manager closes the handle even when a write raises,
            # so a bad row no longer leaks a file descriptor.
            with open(file_path, "w") as f:
                # NOTE(review): writing encoded bytes to a text-mode file only
                # works on Python 2 -- confirm the target interpreter.
                f.write(r.page_title.encode("utf8"))
                f.write("\n")
                f.write(r.page_content)
            if counter % 100 == 0:
                print("extract %d th page now" % counter)
            counter += 1
        offset += batch_size
        batch_result = p.get_pages(offset, batch_size, min_page_len)
예제 #2
0
def extract_from_db_to_file_system(min_page_len=500):
    """Export database pages to ``<config.data_base_path>/temp`` as ``.md`` files.

    The pages are read batch by batch (1000 rows per call to
    ``Page.get_pages(offset, limit, min_page_len)``); the loop ends at the
    first empty batch. Files are named ``1.md``, ``2.md``, ... and contain the
    page title, a newline, then the page content.

    :param min_page_len: passed straight through to ``Page.get_pages``
        (looks like a minimum page length filter -- verify in ``Page``).
    :return: None
    """
    folder = "%s/temp" % config.data_base_path
    # A missing path is never a directory, so isdir() alone covers both cases.
    if not os.path.isdir(folder):
        os.makedirs(folder)

    p = Page()
    counter = 1
    batch_size = 1000
    offset = 0
    limit = batch_size

    batch_result = p.get_pages(offset, limit, min_page_len)
    while len(batch_result) > 0:
        for r in batch_result:
            file_path = "%s/%s.md" % (folder, counter)
            # ``with`` guarantees the file is closed if any write fails.
            with open(file_path, "w") as f:
                # NOTE(review): bytes written to a text-mode handle -- this is
                # Python 2 behaviour; confirm before running on Python 3.
                f.write(r.page_title.encode("utf8"))
                f.write("\n")
                f.write(r.page_content)
            # Progress message every 100 pages.
            if counter % 100 == 0:
                print("extract %d th page now" % counter)
            counter += 1
        offset += batch_size
        batch_result = p.get_pages(offset, limit, min_page_len)
예제 #3
0
 def __init__(self, website, initial_page):
     """Set up crawler state for ``website`` and make sure the Page table exists.

     :param website: site URL; its network location is cached in ``netloc``.
     :param initial_page: stored as-is in ``initial_page``.
     """
     # Inputs supplied by the caller.
     self.website = website
     self.initial_page = initial_page
     self.netloc = urlsplit(website).netloc

     # Bookkeeping containers start out empty.
     self.pages = {}
     self.queue = set()

     Page.create_table()

     # Placeholder id; presumably replaced once stored pages are loaded.
     self.id = -1
예제 #4
0
def many_server(request, tempdir):
    """Start a ZEO server pre-populated with 3001 ``Page`` records.

    Returns the socket produced by ``do_zeo_server`` after the database
    connection used for seeding has been closed.
    """
    sock = do_zeo_server(request, tempdir, name="many_server", fsname='many.fs')
    db = zerodb.testing.db(None, sock)

    filler_text = "lorem ipsum dolor sit amet" * 2
    target_phrase = "this is something we're looking for"

    with transaction.manager:
        for idx in range(2000):
            db.add(Page(title="hello %s" % idx, text=filler_text))
        for idx in range(1000):
            # Repeating the same phrase a varying number of times changes the
            # document length but not its set of terms, which yields
            # different relevance scores.
            repeats = int(idx ** 0.5)
            db.add(Page(title="hello %s" % idx, text=target_phrase * repeats))
        db.add(Page(title="extra page", text="something else is here"))

    db.disconnect()
    return sock
예제 #5
0
def worker(domain):
    """Crawl URLs taken from ``LINKS_QUEUE`` and record their title and first H1.

    For every URL pulled from the queue the worker:

    1. marks it in ``SCANNED_LINKS`` and fetches it with an ``HTMLSession``;
    2. skips it if the request raised or the status code is not 200;
    3. stores ``url`` / ``title`` / ``h1`` via ``Page.create`` and appends a
       tab-separated row to ``results.csv`` (guarded by ``locker``);
    4. enqueues every absolute link on the page that contains ``domain``,
       has not been scanned yet and contains none of ``BAD_PARTS``.

    :param domain: substring a link must contain to be followed.

    NOTE: the loop has no exit condition. An idle-exit (stop once the queue is
    still empty after a 10 second sleep) was sketched here earlier but is
    disabled, so the worker runs until the process is stopped.
    """
    while True:
        url = LINKS_QUEUE.get()
        SCANNED_LINKS.add(url)

        try:
            with HTMLSession() as session:
                resp = session.get(url)
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(e, type(e))
            continue

        # Explicit status check instead of ``assert``: assertions are removed
        # when Python runs with -O, which would let error pages be indexed.
        if resp.status_code != 200:
            print(f'[SKIP] {url} returned HTTP {resp.status_code}')
            continue

        try:
            page_title = resp.html.xpath('//title')[0].text
        except IndexError:
            page_title = 'Not Found'

        try:
            page_h1 = resp.html.xpath('//h1')[0].text
        except IndexError:
            page_h1 = 'Not Found'

        Page.create(url=url, title=page_title, h1=page_h1)
        print('[OK]', url)

        # Serialize appends so rows from concurrent workers do not interleave.
        with locker:
            with open('results.csv', 'a') as f:
                f.write(f'{url}\t{page_title}\t{page_h1}\n')

        for link in resp.html.absolute_links:
            # Drop the fragment so "#section" variants map to one URL.
            link = link.split('#')[0]
            if domain not in link:
                continue
            if link in SCANNED_LINKS:
                continue
            if any(part in link for part in BAD_PARTS):
                continue

            LINKS_QUEUE.put(link)
예제 #6
0
    def generate_adjacency_matrix(self, drop_static=False):
        """Write the page link graph to ``data\\matrix.json``.

        The output is a JSON object mapping each stored page id to the list
        of ids of the stored pages it links to. Links whose target URL has no
        stored page (or, with ``drop_static``, whose target was filtered out)
        are left out of the graph.

        :param drop_static: when true, only pages whose ``content_type``
            contains ``"text/html"`` take part in the graph.
        :return: None
        """
        # Filter once, with the same "text/html" test everywhere.
        pages = [
            page for page in Page.select()
            if not drop_static or "text/html" in page.content_type
        ]

        ids = {page.url: int(page.id) for page in pages}
        matrix = {page.id: set() for page in pages}

        for page in pages:
            targets = matrix[page.id]
            for link in json.loads(page.links):
                # A link may point at a URL that was never stored; looking it
                # up unconditionally used to raise KeyError when
                # drop_static was False.
                if link in ids:
                    targets.add(ids[link])

        # Sets are not JSON-serializable; convert to lists for the dump.
        serializable = {page_id: list(targets) for page_id, targets in matrix.items()}

        with open("data\\matrix.json", "w") as w:
            w.write(json.dumps(serializable))
예제 #7
0
def new_book_page(bid, p_name, p_content):
    """Create a ``Page`` for book ``bid`` and commit it through ``db_session``.

    :param bid: value stored in the page's ``book_id``.
    :param p_name: value stored in ``page_name``.
    :param p_content: value stored in ``page_content``.
    :return: None
    """
    # Take the timestamp once so a freshly created page has identical
    # ``created`` and ``updated`` values instead of two slightly different ones.
    now = datetime.now()
    nbpage = Page(page_name=p_name,
                  page_content=p_content,
                  created=now,
                  updated=now,
                  book_id=bid)
    db_session.add(nbpage)
    db_session.commit()
예제 #8
0
파일: process.py 프로젝트: chagge/WikiQA-1
def extract_from_db_to_file_system(number_limit=-1, min_page_len=0,
                                   start_index=1741794, expected_total=670471):
    """Write pages from the database to ``temp/<n>.md`` files.

    Each page returned by ``Page.get_pages(number_limit, min_page_len)`` is
    written as title, newline, content. Before each file is written a
    progress fraction ``(files written so far) / expected_total`` is printed.

    :param number_limit: the number limit; -1 means no limit.
    :param min_page_len: forwarded unchanged to ``Page.get_pages``.
    :param start_index: number used for the first output file (previously a
        hard-coded constant).
    :param expected_total: divisor of the progress fraction (previously a
        hard-coded constant; presumably the expected page count -- confirm).
    :return: None
    """
    folder = "temp"
    if not os.path.isdir(folder):
        os.makedirs(folder)
    p = Page()
    counter = start_index
    for r in p.get_pages(number_limit, min_page_len):
        # Parenthesising the whole expression prints the same value on
        # Python 2 and avoids ``None * 1.0`` (TypeError) on Python 3.
        print((counter - start_index) * 1.0 / expected_total)
        file_path = folder + "/" + str(counter) + ".md"
        # ``with`` closes the file even if one of the writes raises.
        with open(file_path, "w") as f:
            # NOTE(review): encoded bytes written to a text-mode file is
            # Python 2 behaviour -- confirm the target interpreter.
            f.write(r.page_title.encode("utf8"))
            f.write("\n")
            f.write(r.page_content)
        counter += 1
예제 #9
0
def test_add(db):
    """Adding one page must cost fewer than 22 storage downloads."""
    with transaction.manager:
        downloads_before = db._storage._debug_download_count
        page = Page(title="hello", text="Quick brown lazy fox jumps over lorem  ipsum dolor sit amet")
        db.add(page)
        downloads_after = db._storage._debug_download_count

    request_count = downloads_after - downloads_before
    print("Number of requests:", request_count)
    assert request_count < 22

    # Clean up so the added page does not remain in the database.
    with transaction.manager:
        db.remove(page)
예제 #10
0
파일: test_db.py 프로젝트: izogain/zerodb
def test_reindex(db):
    """Explicit reindexing makes text changes visible to ``Contains`` queries."""

    def has_word(word):
        # Query matching pages whose text contains ``word``.
        return Contains("text", word)

    def hits(query):
        # Number of Page records matching ``query``.
        return len(db[Page].query(query))

    with transaction.manager:
        page = Page(
            title="hello",
            text="Quick0 brown lazy fox jumps over lorem  ipsum dolor sit amet"
        )
        docid = db.add(page)
    assert hits(has_word("quick0")) == 1

    # DbModel, by ID
    with transaction.manager:
        page.text = "Quick1 brown lazy fox jumps over well, you know"
        db[Page].reindex(docid)
    assert hits(has_word("quick0")) == 0
    assert hits(has_word("quick1")) == 1

    # DbModel, by obj
    with transaction.manager:
        page.text = "quick2 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert hits(has_word("quick1")) == 0
    assert hits(has_word("quick2")) == 1

    # DB, by obj
    # NOTE(review): this section also goes through db[Page].reindex rather
    # than db.reindex -- confirm that is intended.
    with transaction.manager:
        page.text = "quick3 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert hits(has_word("quick2")) == 0
    assert hits(has_word("quick3")) == 1

    # DB, multiple objects
    with transaction.manager:
        page2 = Page(
            title="hello",
            text="Quick4 brown lazy fox jumps over lorem  ipsum dolor sit amet"
        )
        db.add(page2)

    with transaction.manager:
        page.text = "quick5 brown lazy fox jumps over well, you know"
        page2.text = "quick5 brown lazy fox jumps over well, you know"
        db.reindex([page, page2])
    assert hits(has_word("quick3") | has_word("quick4")) == 0
    assert hits(has_word("quick5")) == 2
예제 #11
0
    def load(self):
        """Rebuild ``self.pages`` and ``self.queue`` from the stored Page rows.

        Every stored URL becomes a key of ``self.pages`` (value ``None``);
        every linked URL that is not itself stored ends up in ``self.queue``.

        :return: the largest stored page id, or 1 if none is larger.
        """
        highest_id = 1
        stored_urls = set()
        linked_urls = set()

        for row in Page.select():
            highest_id = max(highest_id, row.id)
            stored_urls.add(row.url)
            linked_urls.update(json.loads(row.links))

        self.pages = dict.fromkeys(stored_urls)
        self.queue = linked_urls - stored_urls
        return highest_id
예제 #12
0
    def get_page(self, url):
        """Fetch ``url`` once, store a Page row for it and queue its links.

        Nothing happens if the normalized URL is already in ``self.pages``.
        Otherwise a HEAD request determines the content type:

        * ``text/html``: the page is downloaded, its source is taken from
          ``get_page_source`` (falling back to the plain response body when
          that raises), and the normalized links found by ``parse_page`` are
          stored and added to ``self.queue`` if not yet visited.
        * ``text/html`` but the GET raised: a row with the HEAD status and an
          empty link list is stored.
        * anything else: a row with the HEAD status and no links is stored.

        In every case the URL is recorded in ``self.pages`` and ``self.id``
        advances by one.

        :param url: URL to fetch.
        :return: None
        """
        normalized_url = self.normalize(url)
        if normalized_url in self.pages:
            return

        headers = requests.head(url)
        content_type = headers.headers.get('content-type', '')

        status = headers.status_code
        links = []
        links_json = json.dumps({})  # stored value for non-HTML resources

        if "text/html" in content_type:
            try:
                page = requests.get(url)
            except Exception as e:
                logging.error(f"Requests get exception: {e}")
                links_json = json.dumps([])
            else:
                logging.debug(f"Got {url} [{page.status_code}]")
                status = page.status_code

                try:
                    page_content = get_page_source(url)
                except Exception as e:
                    logging.error(f"Got selenium error: [{e}]")
                    page_content = page.content

                links = [
                    self.normalize(link)
                    for link in self.parse_page(page_content)
                ]
                links_json = json.dumps(links)
        else:
            logging.debug(f"Add {url} with content_type: {content_type}")

        Page.create(id=self.id,
                    url=normalized_url,
                    status=status,
                    content_type=content_type,
                    links=links_json)
        self.id += 1

        # Mark the URL as visited on every path. Previously only successful
        # HTML fetches were marked, so non-HTML or failed URLs could be
        # queued again and stored a second time.
        self.pages[normalized_url] = None
        for link in links:
            if link not in self.pages:
                self.queue.add(link)
예제 #13
0
def test_all_uid(db):
    """Objects obtained from ``db[Page]`` must expose a ``_p_uid`` attribute.

    The check is repeated for several access paths -- ``all()``, lookup by a
    single uid, lookup by a list of uids, and after activation and a commit --
    with the caches emptied between the steps.
    """
    # Test for https://gist.github.com/micxjo/a097698b33fc4669b0b4
    page = Page(title="Test page", text="Hello world")
    with transaction.manager:
        db.add(page)

    # Drop the local reference before flushing the caches.
    del page
    # Clear in-memory and on-disk caches
    db._storage._cache.clear()
    db._connection._cache.full_sweep()

    # Path 1: iterate over all objects.
    for item in db[Page].all():
        assert hasattr(item, "_p_uid")
        del item

    db._storage._cache.clear()
    db._connection._cache.full_sweep()

    # Path 2: fetch each object individually by its uid.
    for uid in db[Page].all_uids():
        obj = db[Page][uid]
        assert hasattr(obj, "_p_uid")
        del obj

    db._storage._cache.clear()
    db._connection._cache.full_sweep()

    # Path 3: fetch up to 10 objects at once by passing a list of uids.
    uids = list(islice(db[Page].all_uids(), 10))
    objs = db[Page][uids]
    for obj in objs:
        assert hasattr(obj, "_p_uid")

    # Path 4: explicit activation of an object.
    objs = list(db[Page].all())
    obj = objs[0]
    obj._p_activate()
    assert hasattr(obj, "_p_uid")

    # Path 5: the attribute must still be there after a modification is committed.
    objs[1].text += " xxx"
    transaction.commit()
    for obj in objs:
        assert hasattr(obj, "_p_uid")
예제 #14
0
파일: test_db.py 프로젝트: davinirjr/zerodb
def test_reindex(db):
    """After each explicit reindex the old text stops matching and the new text matches."""

    def count_text(word):
        # How many Page records contain ``word`` in their text.
        matches = db[Page].query(Contains("text", word))
        return len(matches)

    with transaction.manager:
        page = Page(title="hello", text="Quick0 brown lazy fox jumps over lorem  ipsum dolor sit amet")
        docid = db.add(page)
    assert count_text("quick0") == 1

    # DbModel, by ID
    with transaction.manager:
        page.text = "Quick1 brown lazy fox jumps over well, you know"
        db[Page].reindex(docid)
    assert count_text("quick0") == 0
    assert count_text("quick1") == 1

    # DbModel, by obj
    with transaction.manager:
        page.text = "quick2 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert count_text("quick1") == 0
    assert count_text("quick2") == 1

    # DB, by obj
    # NOTE(review): the call below is db[Page].reindex, same as the previous
    # section, not db.reindex -- confirm this is what was meant.
    with transaction.manager:
        page.text = "quick3 brown lazy fox jumps over well, you know"
        db[Page].reindex(page)
    assert count_text("quick2") == 0
    assert count_text("quick3") == 1

    # DB, multiple objects
    with transaction.manager:
        page2 = Page(title="hello", text="Quick4 brown lazy fox jumps over lorem  ipsum dolor sit amet")
        db.add(page2)

    with transaction.manager:
        page.text = "quick5 brown lazy fox jumps over well, you know"
        page2.text = "quick5 brown lazy fox jumps over well, you know"
        db.reindex([page, page2])
    stale_query = Contains("text", "quick3") | Contains("text", "quick4")
    assert len(db[Page].query(stale_query)) == 0
    assert count_text("quick5") == 2
예제 #15
0
        # page info
        page_number = 1
        for page_link in driver.find_elements_by_css_selector(
            "div#content div.prevws a"
        ):
            page_url = page_link.get_attribute("href")
            page = session.query(Page).filter(Page.url == page_url).first()
            page_text = get_page_text(
                journal_title, issue_date, issue_text, page_number
            )
            try:
                page_link.find_element_by_class_name("treffer")
                hit = True
            except NoSuchElementException:
                hit = False
            if page:
                if args.update:
                    page.text = page_text
            else:
                page = Page(issue.issue_id, page_number, page_text, hit, page_url)
            session.add(page)
            logger.debug(
                f"Page info extracted. Number: {page_number}, page url: {page_url} and page text: {page_text[:10] if page_text else None}"
            )
            page_number += 1
        session.commit()

    session.close()
    driver.quit()
    logger.info(f"Completed. Processing took {(datetime.now() - t1).seconds}s.")
예제 #16
0
def test_auto_reindex(db):
    """Attribute changes made inside a transaction are reindexed automatically.

    Covers: single and multiple objects, several assignments in one
    transaction (expects exactly one ``reindex_one`` call), switching the
    feature off and on via ``db.enableAutoReindex``, and objects that are
    modified before or after being added to the database.
    """
    with transaction.manager:
        page = Page(title="hello", text="autoreindex0, test whether to work")
        db.add(page)
    assert len(db[Page].query(Contains("text", "autoreindex0"))) == 1

    # Changing the text without an explicit reindex call must update the index.
    with transaction.manager:
        page.text = "autoreindex1, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex0"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex1"))) == 1

    with transaction.manager:
        page2 = Page(title="hello", text="autoreindex2, test whether to work")
        db.add(page2)

    # Two objects modified in the same transaction.
    with transaction.manager:
        page.text = "autoreindex3, test whether to work"
        page2.text = "autoreindex3, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex1") | Contains("text", "autoreindex2"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex3"))) == 2

    # Several assignments to one object in one transaction: one reindex call.
    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:
            page.text = "autoreindex3, test whether to work1"
            page.text = "autoreindex3, test whether to work2"
            page.text = "autoreindex3, test whether to work3"
        assert reindex_mock.call_count == 1

    # With auto-reindex disabled the index keeps the old text.
    db.enableAutoReindex(False)
    with transaction.manager:
        page.text = "autoreindex4, test whether to work"
    assert len(db[Page].query(Contains("text", "autoreindex3"))) == 2
    assert len(db[Page].query(Contains("text", "autoreindex4"))) == 0

    db.enableAutoReindex(True)
    with transaction.manager:    # should not raise ModelException
        # page3 is never added to the db here, so neither title is indexed.
        page3 = Page(title="helloworld", text="autoreindex5, test whether to work")
        page3.title = "helloworld1"
    assert len(db[Page].query(Eq("title", "helloworld"))) == 0
    assert len(db[Page].query(Eq("title", "helloworld1"))) == 0

    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:  # should not reindex
            # The change happens before db.add, so no reindex call is expected.
            page3 = Page(title="helloworld", text="autoreindex5, test whether to work")
            page3.title = "helloworld1"
            db.add(page3)
        assert reindex_mock.call_count == 0

    with transaction.manager:  # should reindex
        page3 = Page(title="helloworld", text="autoreindex6, test whether to work")
        db.add(page3)
        page3.title = "helloworld1"
        page3.text = "autoreindex7, test whether to work"
    assert len(db[Page].query(Eq("title", "helloworld"))) == 0
    # Expects 2 matches: presumably this page plus the "helloworld1" page
    # added in the previous (mocked) block -- TODO confirm.
    assert len(db[Page].query(Eq("title", "helloworld1"))) == 2
    assert len(db[Page].query(Contains("text", "autoreindex6"))) == 0
    assert len(db[Page].query(Contains("text", "autoreindex7"))) == 1

    with mock.patch("zerodb.db.DbModel.reindex_one") as reindex_mock:
        with transaction.manager:  # should reindex
            # Two attribute changes after db.add: exactly one reindex call.
            page3 = Page(title="helloworld", text="autoreindex6, test whether to work")
            db.add(page3)
            page3.title = "helloworld1"
            page3.text = "autoreindex7, test whether to work"
        assert reindex_mock.call_count == 1