예제 #1
0
 def extractLinks(url, depth, maxDepth):
     """Crawl ``url`` and recurse into the links found on it.

     URLs that fail ``Utils.isValidURL`` or that are already listed in
     ``readLinks`` are ignored. Otherwise the URL is appended to
     ``readLinks``, its info is fetched with ``Crawler.getInfoFromOneWeb``,
     a ``Page`` built from it is stored through ``dc.addData`` and, as long
     as ``depth + 1`` is below ``maxDepth``, every link found is crawled
     with that incremented depth.
     """
     # Guard clause: nothing to do for invalid or already visited URLs.
     if not Utils.isValidURL(url) or url in readLinks:
         return

     readLinks.append(url)
     foundLinks = Crawler.getInfoFromOneWeb(url)['links']

     progress = ('===== Working on: ' + url + ' --- depth: ' + str(depth) +
                 ' --- ' + str(len(foundLinks)) + ' links were found')
     print(progress)

     # The page is stored with the depth it was reached at, not the
     # incremented depth used for its children.
     page = Page(url, '', depth, url, foundLinks, [], [])
     dc.addData('collectionoftest', page.returnLikeObject())

     childDepth = depth + 1
     if childDepth < maxDepth:
         for childUrl in foundLinks:
             MainCrawler.extractLinks(childUrl, childDepth, maxDepth)
예제 #2
0
파일: pump.py 프로젝트: michelp/wikiparser
def wikit():
    num = 0
    if len(sys.argv) > 1:
        db = xodb.open('test_et4', writable=False)
        last = db.backend.get_doccount()
    with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
        current = None
        batch = []
        for event, element in etree.iterparse(f, events=('start', 'end')):
            if event == 'start':
                if element.tag.endswith('page'):
                    current = Page()
                    redirect = False
                if element.tag.endswith('title'):
                    current.title = element.text
                if element.tag.endswith('text'):
                    if element.text:
                        current.text = element.text
                if element.tag.endswith('redirect'):
                    redirect = True
            if event == 'end':
                if element.tag.endswith('page'):
                    # num += 1
                    # if num < last:
                    #     print 'Skipping  ', num, " ", last, " ", current.title
                    #     element.clear()
                    #     continue
                    if current.text and current.title and not redirect:
                        if not current.title.startswith(
                            ('Template:', 'Category:', 'File:')):
                            print "Pumping ", current.title
                            try:
                                batch.append(current)
                                if len(batch) > BATCH_SIZE:
                                    source.send(dumps(batch))
                                    batch = []
                            except Exception:
                                log.exception('wtf')
                    element.clear()
예제 #3
0
파일: pump.py 프로젝트: michelp/wikiparser
def wikit():
    num = 0
    if len(sys.argv) > 1:
        db = xodb.open('test_et4', writable=False)
        last = db.backend.get_doccount()
    with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
        current = None
        batch = []
        for event, element in etree.iterparse(f, events=('start', 'end')):
            if event == 'start':
                if element.tag.endswith('page'):
                    current = Page()
                    redirect = False
                if element.tag.endswith('title'):
                    current.title = element.text
                if element.tag.endswith('text'):
                    if element.text:
                        current.text = element.text
                if element.tag.endswith('redirect'):
                    redirect = True
            if event == 'end':
                if element.tag.endswith('page'):
                    # num += 1
                    # if num < last:
                    #     print 'Skipping  ', num, " ", last, " ", current.title
                    #     element.clear()
                    #     continue
                    if current.text and current.title and not redirect:
                        if not current.title.startswith(('Template:', 'Category:', 'File:')):
                            print "Pumping ", current.title
                            try:
                                batch.append(current)
                                if len(batch) > BATCH_SIZE:
                                    source.send(dumps(batch))
                                    batch = []
                            except Exception:
                                log.exception('wtf')
                    element.clear()
예제 #4
0
from lxml import etree

from schemas import Page, PageSchema

log = logging.getLogger(__name__)
db = xodb.open('/home/michel/xap/ms_test2')
# NOTE(review): presumably associates Page objects with PageSchema for
# indexing -- confirm against the xodb documentation.
db.map(Page, PageSchema)

# Not referenced anywhere in the code visible here.
redirects = {}

# Walk the Wikipedia dump incrementally and add every page that has a title
# and text and is not a redirect to the database.
with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
    current = None
    for event, element in etree.iterparse(f, events=('start', 'end')):
        if event == 'start':
            if element.tag.endswith('page'):
                # A new page begins: reset the per-page state.
                current = Page()
                redirect = False
        if event == 'end':
            # Element text is read on 'end' events because lxml does not
            # guarantee it is present yet when the 'start' event fires.
            if element.tag.endswith('title'):
                current.title = element.text
            if element.tag.endswith('text'):
                if element.text:
                    current.text = element.text
            if element.tag.endswith('redirect'):
                redirect = True
            if element.tag.endswith('page'):
                if current.text and current.title and not redirect:
                    try:
                        db.add(current)
                    except Exception:
                        log.exception('wtf')
                # Free the finished page's subtree; otherwise the parsed
                # tree of the whole dump accumulates in memory.
                element.clear()
예제 #5
0

log = logging.getLogger(__name__)
db = xodb.open('/home/michel/xap/ms_test2')
# NOTE(review): looks like this binds PageSchema to Page objects for
# indexing -- confirm against the xodb documentation.
db.map(Page, PageSchema)


# Not used by any of the code visible here.
redirects = {}


# Parse the Wikipedia dump as a stream and store each page that has both a
# title and text and is not marked as a redirect.
with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
    current = None
    for event, element in etree.iterparse(f, events=('start', 'end')):
        if event == 'start':
            # Opening <page> tag: begin collecting a fresh page.
            if element.tag.endswith('page'):
                current = Page()
                redirect = False
        if event == 'end':
            # Content is captured when an element closes; on 'start' lxml
            # may not have parsed the element's text yet.
            if element.tag.endswith('title'):
                current.title = element.text
            if element.tag.endswith('text'):
                if element.text:
                    current.text = element.text
            if element.tag.endswith('redirect'):
                redirect = True
            if element.tag.endswith('page'):
                if current.text and current.title and not redirect:
                    try:
                        db.add(current)
                    except Exception:
                        log.exception('wtf')
                # Clear the completed page so its subtree does not stay in
                # memory for the rest of the parse.
                element.clear()