Example #1
    def doOneTask(self, task):
        #0. sign it 
        self.signTask(task)     
        task.status = 'parsing'
        #1. call handler 
        if task.handler not in self.hs: 
            #from MainPageHandler import MainPageHandler
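            # lazily import the module named after the handler, instantiate its
            # same-named class, and cache the instance for reuse across tasks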
            m = __import__(task.handler)
            c = getattr(m, task.handler)
            self.hs[task.handler] = c()
        h = self.hs[task.handler]

        #2. check type and do it
        isOK = True
        if task.taskType in ('media', 'page'): 
            #deal with it
            output = {}
            try:
                output = h.parse(task)
            except Exception, e:
                # todo  set task failed here 
                util.printException()
                myLogger.error("sth wrong [%s][%s] in parsing, set task[%d] failed" % (task.handler, e, task.id))
                isOK = False
            if 'newTasks' in output:
                for t in output['newTasks']:
                    # get a taskId from my manager
                    self.m.packTask(t)
                    self.signTask(t)
                    self.m.addTask(t)
Example #2
 def run(self):
     myLogger.info("worker %s begin to run" % (self.name))
     myLogger.info("my manager's inQueue size [%d]" % (self.inQueue.qsize()))
     self.hungerly = False
     while True:
         try:
             # get 
             get = None
             try:
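                 # poll with a short timeout so the loop can notice
                 # self.m.shouldExit instead of blocking forever on an empty queue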
                 get = self.inQueue.get(timeout=1)
                 self.hungerly = False
             except Queue.Empty, e:
                 self.hungerly = True
                 myLogger.debug('thread [%s] is hungerly now' % (self.name))
                 if self.m.shouldExit == True:
                     myLogger.debug('thread [%s] is exiting' % (self.name))
                     break
             if self.hungerly == False:
                 # do
                 myLogger.info("thread [%s], get task[%s] from queue" % (self.name, get.id))
                 time.sleep(3)
                 # put
                 self.doOneTask(get)
             else:
                 time.sleep(2)
         except Exception, e:
             util.printException()
             myLogger.error('sth wrong[%s] in thread [%s]' % (e, self.name))
             if get is not None:  # the exception may fire before a task was fetched
                 get.status = 'failed'
                 if get.msg == '':  # record the failure reason only if none is set yet
                     get.msg = util.exprException()
Example #3
    def download(self, url, to, postdata={}, timeout=60, times=3, safeFetch=True):
        myLogger.info("fetcher download from [%s] to [%s]" % (url, to))
        ret = None
        while times!=0:
            status=''
            if safeFetch == True:
                resp = self.safeFetch(url, postdata, timeout)
            else:
                resp, status = self.fetch(url, postdata, timeout)

            if util.mkdir(os.path.dirname(to)) == False:
                times -= 1
                ret = None
                continue

            if status == 'OK' or (status == '' and resp != None): # fetch ok
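                # stream the response body to disk in 5 MB chunks so large
                # downloads are not held in memory at once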
                CHUNK = 1024 * 1024 * 5
                with open(to, 'wb') as f:
                    ret = True
                    try:
                        while True:
                            chunk = resp.read(CHUNK)
                            if not chunk: 
                                break
                            f.write(chunk)
                    except Exception, e:
                        msg = util.exprException()
                        util.printException()
                        myLogger.error("writing [%s] failed: %s" % (to, msg))
                        ret = None

                if ret == True:
                    break
            times -= 1
Example #4
 def run(self):
     myLogger.info("worker %s begin to run" % (self.name))
     myLogger.info("my manager's inQueue size [%d]" %
                   (self.inQueue.qsize()))
     self.hungerly = False
     while True:
         try:
             # get
             get = None
             try:
                 get = self.inQueue.get(timeout=1)
                 self.hungerly = False
             except Queue.Empty, e:
                 self.hungerly = True
                 myLogger.debug('thread [%s] is hungerly now' % (self.name))
                 if self.m.shouldExit == True:
                     myLogger.debug('thread [%s] is exiting' % (self.name))
                     break
             if self.hungerly == False:
                 # do
                 myLogger.info("thread [%s], get task[%s] from queue" %
                               (self.name, get.id))
                 time.sleep(3)
                 # put
                 self.doOneTask(get)
             else:
                 time.sleep(2)
         except Exception, e:
             util.printException()
             myLogger.error('sth wrong[%s] in thread [%s]' % (e, self.name))
             if get is not None:  # the exception may fire before a task was fetched
                 get.status = 'failed'
                 if get.msg == '':  # record the failure reason only if none is set yet
                     get.msg = util.exprException()
Example #5
    def doOneTask(self, task):
        #0. sign it
        self.signTask(task)
        task.status = 'parsing'
        #1. call handler
        if task.handler not in self.hs:
            #from MainPageHandler import MainPageHandler
            m = __import__(task.handler)
            c = getattr(m, task.handler)
            self.hs[task.handler] = c()
        h = self.hs[task.handler]

        #2. check type and do it
        isOK = True
        if task.taskType in ('media', 'page'):
            #deal with it
            output = {}
            try:
                output = h.parse(task)
            except Exception, e:
                # todo  set task failed here
                util.printException()
                myLogger.error(
                    "sth wrong [%s][%s] in parsing, set task[%d] failed" %
                    (task.handler, e, task.id))
                isOK = False
            if 'newTasks' in output:
                for t in output['newTasks']:
                    # get a taskId from my manager
                    self.m.packTask(t)
                    self.signTask(t)
                    self.m.addTask(t)
Example #6
    def parseContent(self, page):
        page = page.decode('gbk')
        ret = {'stockList': [], 'nextPage': ''}
        try:
            soup = BS(page)
            theA = soup.find('a', text='下一页')
            if theA != None and theA['href'] != '':
                ret['nextPage'] = theA['href']

            table = soup.find('table',
                              width="100%",
                              cellspacing="1",
                              cellpadding="2",
                              border="0",
                              bgcolor="#337fb2")
            trList = table.find_all('tr')
            for tr in trList[1:]:
                tdList = tr.find_all('td')
                code = tdList[0].text
                name = tdList[1].text
                href = tdList[0].a['href']
                ret['stockList'].append({
                    'code': code,
                    'name': name,
                    'href': href
                })
        except Exception, e:
            util.printException()
            return (None, e)
Example #7
def ensureSIFTFeatures(filepath,
                       paramsSIFT,
                       properties,
                       csvDir,
                       validateByFileExists=False):
    """
     filepath: to the image from which SIFT features have been or have to be extracted.
     params: dict of registration parameters, including the key "scale".
     paramsSIFT: FloatArray2DSIFT.Params instance.
     csvDir: directory into which serialized features have been or will be saved.
     load: function to load an image as an ImageProcessor from the filepath.
     validateByFileExists: whether to merely check that the .obj file exists as a quick form of validation.
     
     First check if serialized features exist for the image, and if the Params match.
     Otherwise extract the features and store them serialized.
     Returns the ArrayList of Feature instances.
  """
    path = os.path.join(csvDir,
                        os.path.basename(filepath) + ".SIFT-features.obj")
    if validateByFileExists:
        if os.path.exists(path):
            return True
    # An ArrayList whose last element is a mpicbg.imagefeatures.FloatArray2DSIFT.Param
    # and all other elements are mpicbg.imagefeatures.Feature
    features = deserialize(path) if os.path.exists(path) else None
    if features:
        if features.get(features.size() - 1).equals(paramsSIFT):
            features.remove(features.size() - 1)  # removes the Params
            syncPrintQ("Loaded %i SIFT features for %s" %
                       (features.size(), os.path.basename(filepath)))
            return features
        else:
            # Remove the file: paramsSIFT have changed
            os.remove(path)
    # Else, extract de novo:
    try:
        # Extract features
        imp = loadImp(filepath)
        ip = imp.getProcessor()
        paramsSIFT = paramsSIFT.clone()
        ijSIFT = SIFT(FloatArray2DSIFT(paramsSIFT))
        features = ArrayList()  # of Feature instances
        ijSIFT.extractFeatures(ip, features)
        ip = None
        imp.flush()
        imp = None
        features.add(
            paramsSIFT
        )  # append Params instance at the end for future validation
        serialize(features, path)
        features.remove(features.size() -
                        1)  # to return without the Params for immediate use
        syncPrintQ("Extracted %i SIFT features for %s" %
                   (features.size(), os.path.basename(filepath)))
    except:
        printException()
    return features
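A minimal usage sketch for ensureSIFTFeatures above; the file path and csvDir are hypothetical placeholders, and paramsSIFT and properties are assumed to be configured elsewhere in the script:

    # Hypothetical call: load cached features, or extract and serialize them.
    features = ensureSIFTFeatures("/data/sections/section-0001.tif",
                                  paramsSIFT, properties, "/data/csv/")
    if features is not None:
        syncPrintQ("Have %i SIFT features" % features.size())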
Example #8
 def parseContent(self, page):
     ret = {'excelUrl':''}
     try:
         soup = BS(page)           
         td = soup.find('td', align="right", width="60px", valign="bottom") 
         print td
         print td.a['href']
     except Exception, e:
         util.printException()
         return (None, e)
Example #9
 def parseContent(self, page):
     ret = {}
     soup = BS(page)
     try:
         li = soup.find('li', 'downloadlinkstatic')
         src = li.a.get('href')
         ret['contentMp3'] = src
     except Exception, e:
         util.printException()
         return (None, e)
Example #10
 def parseContent(self, page):
     ret = {'excelUrl': ''}
     try:
         soup = BS(page)
         td = soup.find('td', align="right", width="60px", valign="bottom")
         print td
         print td.a['href']
     except Exception, e:
         util.printException()
         return (None, e)
Example #11
 def parseContent(self, page):
     ret = {}
     soup = BS(page)
     try:
         li = soup.find('li', 'downloadlinkstatic') 
         src = li.a.get('href')
         ret['contentMp3'] = src
     except Exception, e:
         util.printException()
         return (None, e)
Example #12
 def parseContent(self, page):
     ret = []
     page = page.decode('utf-8')
     try:
         soup = BS(page)
         tableDiv = soup.find('div', class_='genTable')
         trList = tableDiv.find_all('tr')
         for tr in trList:
             ret.append(tr.text)
     except Exception, e:
         util.printException()
         return (None, e)
Example #13
 def parseContent(self, page):
     ret = []
     page = page.decode('utf-8')
     try:
         soup = BS(page)
         tableDiv = soup.find('div', class_='genTable')
         trList = tableDiv.find_all('tr')
         for tr in trList:
             ret.append(tr.text)   
     except Exception, e:
         util.printException()
         return (None, e)
Example #14
    def parseContent(self, page):
        ret = []
        try:
            soup = BS(page)           
            headerDiv = soup.find(id='indexItems')  
            headlis = headerDiv.select('li')
            for li in headlis:
                href = li.a['href']
                if href.endswith(".html"):  # str.rstrip strips a character set, not a suffix
                    href = href[:-len(".html")]
                iUrl = urlparse.urljoin(href + "/", "pc10.html?tab=None")
                #iUrl = urlparse.urljoin(href + "/", "pc0.html?tab=None")
                ret.append(iUrl)

        except Exception, e:
            util.printException()
            return (None, e)
Example #15
 def parseContent(self, page):
     ret = {}
     try:
         soup = BS(page.decode('gbk'))
         table = soup.find('table', width="92%", cellspacing="1", cellpadding="2", border="0") 
         trList = table.find_all('tr')
         for tr in trList:
             tdList = tr.find_all('td')
             if len(tdList) <= 1:
                 continue
             k = tdList[0].text.strip().split(":")[0]
             v = tdList[1].text.strip()
             ret[k] = v
     except Exception, e:
         util.printException()
         return (None, e)
Example #16
    def parseContent(self, page):
        ret = []
        try:
            soup = BS(page)           
            headerDiv = soup.find(id='header')  
            headlis = headerDiv.select('li.header_navigation_item.has_child')
            for li in headlis:
                if li.a.text == 'Audio':
                    links = li.find_all('a', 'section_link')
                    break
            for link in links:
                ret.append(link['href'])

        except Exception, e:
            util.printException()
            return (None, e)
Example #17
 def parseContent(self, page):
     ret = {'stockInfoPage':{}, 'stockList':[]}
     try:
         soup = BS(page)
         trList = soup.find_all('tr', class_='tr_normal')
         for tr in trList:
             tdList = tr.find_all('td')
             href = ''
             if tdList[1].a is not None:  # hasattr is always true on a Tag; test the child <a> directly
                 href = tdList[1].a['href']
             textList = [td.text for td in tdList]
             textList.append(href)
             ret['stockList'].append(textList)
             ret['stockInfoPage'][textList[0]] = href
     except Exception, e:
         util.printException()
         return (None, e)
Example #18
 def parseContent(self, page):
     ret = {"zipPic": {}, "contentPage": {}}
     try:
         soup = BS(page)
         articleUl = soup.find(id="articleItems")
         # ul = soup.find('ul', "bullet_orange")
         divs = articleUl.find_all("div", "media-block")
         for div in divs:
             zipPic = div.img.get("src")
             # has audio?
             a = div.find("a", "img-wrapper")
             if a and a["href"] != "":
                 url = a.get("href")
                 key = url.replace("/", "_")
                 ret["zipPic"][key] = zipPic
                 ret["contentPage"][key] = url
     except Exception, e:
         util.printException()
         return (None, e)
Example #19
    def parseContent(self, page):
        page = page.decode('gbk')
        ret = {'stockList':[], 'nextPage':''}
        try:
            soup = BS(page)
            theA = soup.find('a', text='下一页')
            if theA != None and theA['href'] != '':
                ret['nextPage'] = theA['href']

            table = soup.find('table', width="100%", cellspacing="1", cellpadding="2", border="0", bgcolor="#337fb2") 
            trList = table.find_all('tr')
            for tr in trList[1:]:
                tdList = tr.find_all('td')             
                code = tdList[0].text
                name = tdList[1].text
                href = tdList[0].a['href']
                ret['stockList'].append({'code': code, 'name': name, 'href': href})
        except Exception, e:
            util.printException()
            return (None, e)
Example #20
 def parseContent(self, page):
     ret = {}
     try:
         soup = BS(page.decode('gbk'))
         table = soup.find('table',
                           width="92%",
                           cellspacing="1",
                           cellpadding="2",
                           border="0")
         trList = table.find_all('tr')
         for tr in trList:
             tdList = tr.find_all('td')
             if len(tdList) <= 1:
                 continue
             k = tdList[0].text.strip().split(":")[0]
             v = tdList[1].text.strip()
             ret[k] = v
     except Exception, e:
         util.printException()
         return (None, e)
Example #21
def extractSIFTMatches(filepath1, filepath2, params, paramsSIFT, properties,
                       csvDir):
    # Skip if pointmatches CSV file exists already:
    csvpath = os.path.join(
        csvDir,
        basename(filepath1) + '.' + basename(filepath2) + ".pointmatches.csv")
    if os.path.exists(csvpath):
        return False

    try:
        # Load from CSV files or extract features de novo
        features1 = ensureSIFTFeatures(filepath1, paramsSIFT, properties,
                                       csvDir)
        features2 = ensureSIFTFeatures(filepath2, paramsSIFT, properties,
                                       csvDir)
        #syncPrintQ("Loaded %i features for %s\n       %i features for %s" % (features1.size(), os.path.basename(filepath1),
        #                                                                     features2.size(), os.path.basename(filepath2)))
        # Vector of PointMatch instances
        sourceMatches = FloatArray2DSIFT.createMatches(
            features1,
            features2,
            params.get(
                "max_sd",
                1.5),  # max_sd: maximal difference in size (ratio max/min)
            TranslationModel2D(),
            params.get(
                "max_id",
                Double.MAX_VALUE),  # max_id: maximal distance in image space
            params.get("rod", 0.9))  # rod: ratio of best vs second best

        syncPrintQ("Found %i SIFT pointmatches for %s vs %s" %
                   (sourceMatches.size(), os.path.basename(filepath1),
                    os.path.basename(filepath2)))

        # Store pointmatches
        savePointMatches(os.path.basename(filepath1),
                         os.path.basename(filepath2), sourceMatches, csvDir,
                         params)
        return True
    except:
        printException()
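A usage sketch for extractSIFTMatches; the file paths are hypothetical, and params, paramsSIFT and properties are assumed configured as elsewhere in the script:

    # Returns False when the pointmatches CSV already exists, True when computed,
    # and None when an exception was printed.
    done = extractSIFTMatches("/data/sections/s0001.tif", "/data/sections/s0002.tif",
                              params, paramsSIFT, properties, "/data/csv/")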
Example #22
def loadFeatures(img_filename,
                 directory,
                 params,
                 validateOnly=False,
                 epsilon=0.00001,
                 verbose=True):
    """ Attempts to load features from filename + ".features.csv" if it exists,
      returning a list of Constellation features or None.
      params: dictionary of parameters with which features are wanted now,
              to compare with parameter with which features were extracted.
              In case of mismatch, return None.
      epsilon: allowed error when comparing floating-point values.
      validateOnly: if True, return after checking that parameters match. """
    try:
        csvpath = os.path.join(directory,
                               basename(img_filename) + ".features.csv")
        if os.path.exists(csvpath):
            with open(csvpath, 'r') as csvfile:
                reader = csv.reader(csvfile, delimiter=',', quotechar='"')
                # First line contains parameter names, second line their values
                if not checkParams(params, reader.next(), reader.next(),
                                   epsilon):
                    return None
                if validateOnly:
                    return True  # would return None above, which is falsy
                reader.next()  # skip header with column names
                features = [
                    Constellation.fromRow(map(float, row)) for row in reader
                ]
                if verbose:
                    syncPrint("Loaded %i features for %s" %
                              (len(features), img_filename))
                return features
        else:
            if verbose:
                syncPrint("No stored features found at %s" % csvpath)
            return None
    except:
        syncPrint("Could not load features for %s" % img_filename)
        printException()
        return None
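A usage sketch for loadFeatures; the file name and directory are hypothetical placeholders:

    # Returns a list of Constellation features, or None when absent or when the
    # stored parameters don't match the current ones.
    features = loadFeatures("/data/img/t0001.tif", "/data/csv/", params)
    if features is None:
        pass  # features would have to be extracted de novo here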
Example #23
 def parseContent(self, page):
     ret = {'zipPic':{}, 'contentPage':{}}
     try:
         soup = BS(page)           
         titleDiv = soup.find(id='archive').h2  
         classTitle = titleDiv.text
         ul = soup.find('ul', "bullet_orange")
         divs = ul.find_all('div', 'archive_rowmm')
         for div in divs:
             zipPic = div.img.get('src')
             #has audio?
             a = div.h4.a
             if a.select('span.assignedIcon.asIcoAudio') != []:
                 span = a.find('span', 'underlineLink')
                 url = a.get('href')
                 key = url.replace('/', '_')
                 ret['zipPic'][key] = zipPic
                 ret['contentPage'][key] = url
     except Exception, e:
         util.printException()
         return (None, e)
Example #24
 def parseContent(self, page):
     ret = {'zipPic': {}, 'contentPage': {}}
     try:
         soup = BS(page)
         titleDiv = soup.find(id='archive').h2
         classTitle = titleDiv.text
         ul = soup.find('ul', "bullet_orange")
         divs = ul.find_all('div', 'archive_rowmm')
         for div in divs:
             zipPic = div.img.get('src')
             #has audio?
             a = div.h4.a
             if a.select('span.assignedIcon.asIcoAudio') != []:
                 span = a.find('span', 'underlineLink')
                 url = a.get('href')
                 key = url.replace('/', '_')
                 ret['zipPic'][key] = zipPic
                 ret['contentPage'][key] = url
     except Exception, e:
         util.printException()
         return (None, e)
Example #25
def loadPointMatches(img1_filename,
                     img2_filename,
                     directory,
                     params,
                     epsilon=0.00001,
                     verbose=True):
    """ Attempts to load point matches from filename1 + '.' + filename2 + ".pointmatches.csv" if it exists,
      returning a list of PointMatch instances or None.
      params: dictionary of parameters with which pointmatches are wanted now,
              to compare with parameter with which pointmatches were made.
              In case of mismatch, return None.
      epsilon: allowed error when comparing floating-point values. """
    try:
        csvpath = os.path.join(
            directory,
            basename(img1_filename) + '.' + basename(img2_filename) +
            ".pointmatches.csv")
        if not os.path.exists(csvpath):
            if verbose:
                syncPrint("No stored pointmatches found at %s" % csvpath)
            return None
        with open(csvpath, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            # First line contains parameter names, second line their values
            if not checkParams(params, reader.next(), reader.next(), epsilon):
                return None
            if next(reader, None) is None:  # skip header with column names
                return []  # zero pointmatches
            pointmatches = PointMatches.fromRows(reader).pointmatches
            if verbose:
                syncPrint("Loaded %i pointmatches for %s, %s" %
                          (len(pointmatches), img1_filename, img2_filename))
            return pointmatches
    except:
        syncPrint("Could not load pointmatches for pair %s, %s" %
                  (img1_filename, img2_filename))
        printException()
        return None
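A usage sketch for loadPointMatches; file names and directory are hypothetical placeholders:

    # None means absent or stale parameters; an empty list means the CSV recorded
    # zero pointmatches.
    pms = loadPointMatches("t0001.tif", "t0002.tif", "/data/csv/", params)
    if pms is None:
        pass  # pointmatches would have to be recomputed here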
Example #26
    def parseContent(self, page):
        ret = {'contentPics':[], 'contentPicCaptions':[], 'embPics':[]}
        try:
            soup = BS(page)           
            articleDiv = soup.find('div', id='article') 
            siteTitleH2 = articleDiv.find('h2', 'sitetitle')
            ret['siteTitle'] = siteTitleH2.a.text

            titleH1 = articleDiv.find('h1')
            ret['title'] = titleH1.text.strip()

            picDiv = soup.find('div', 'watermark')
            if picDiv and picDiv.parent['class'] == ['contentImage', 'floatNone']:
                ret['contentPics'].append(picDiv.a.img.get('src'))
                ret['contentPicCaptions'].append(picDiv.next_sibling.text)

            li = soup.find('li', 'downloadlinkstatic') 
            if li != None:
                src = li.a.get('href')
                ret['contentMp3'] = src
            else:
                li = soup.find('li', 'listenlink')
                url = li.a.get('href')
                ret['contentMp3Page'] = url
                              
            contentDiv = articleDiv.find('div', 'articleContent') 
            dateDiv = contentDiv.find('div', 'dateblock')
            date = dateDiv.text.strip()
            ret['date'] = date
            
            contentZoomMeDiv = contentDiv.find('div', 'zoomMe') 
            #delete mp3 player part
            mp3H5 = contentZoomMeDiv.find('h5', 'tagaudiotitle')
            if mp3H5:
                print mp3H5
                div = mp3H5.find_next_sibling('div', 'mediaplayer audioplayer')   
                div.decompose()
                mp3H5.decompose()

            #delete script
            for ele in contentZoomMeDiv.find_all('script'):
                ele.decompose()
            
            for ul in contentZoomMeDiv.find_all('ul'):
                if ul.find('li', 'playlistlink') or \
                    ul.find('li', 'listenlink'):
                        ul.decompose()

            #print contentZoomMeDiv
            #delete until first p
            if contentZoomMeDiv.find('div', 'wordclick'):
                iterContent = contentZoomMeDiv.find('div', 'wordclick')
            else:
                iterContent = contentZoomMeDiv

            for tag in iterContent.find_all():
                if tag.name != None:
                    if tag.name != 'p' and tag.name != 'br':
                        tag.decompose()
                    else:
                        break
            
            keepDelete = False
            for tag in iterContent.find_all():
                if tag.name != None:
                    if keepDelete == False:
                        if tag.name == 'div':
                            tagClass = tag.get('class')
                            if tag.find('p') and self.reXHX.search(tag.p.text):
                                oriTag = tag
                                tag =  tag.p
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()

                                #if  tag.find('span') and self.reXHX.search(tag.span.text):
                                #    print tag
                                #    #keepDelete = True
                                #    #tag.span.decompose()

                                #elif tag.find('em') and self.reXHX.search(tag.em.text):
                                #    print tag
                                #    #keepDelete = True
                                #    #tag.em.decompose()

                                #elif self.reXHX.search(tag.text):
                                #    print tag
                                #    #keepDelete = True
                                #    #tag.decompose()
                            
                            elif tagClass:
                                if 'infgraphicsAttach' in tagClass:
                                    tag.decompose()
                                if 'boxwidget' in tagClass:
                                    #boxwidget w_Quiz2c w_QuizInside
                                    tag.decompose()
                        elif tag.name == 'iframe':
                            tag.decompose()

                        elif tag.name == 'p':
                            #check em
                            if  tag.find('span') and self.reXHX.search(tag.span.text):
                                #print tag
                                #keepDelete = True
                                #tag.span.decompose()
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()

                            elif tag.find('em') and self.reXHX.search(tag.em.text):
                                #print tag
                                #keepDelete = True
                                #tag.em.decompose()
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()

                            elif self.reXHX.search(tag.text):
                                print tag
                                #keepDelete = True
                                #tag.decompose()
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()
                                  
                    else:
                        tag.decompose()

            #print contentZoomMeDiv
            #filter photos in content
            for tag in iterContent.find_all():
                if tag.name == 'div' and tag.get('class'):
                    if 'embedded_content_object' in tag.get('class'):
                        embDiv = tag 
                        embImgDiv = embDiv.find('div', 'contentImage')
                        if embImgDiv:
                            embImg = embImgDiv.find('img')
                            src = embImg.get('src')
                            ret['embPics'].append(src)
                            newSrc = os.path.basename(urlparse.urlparse(src).path)
                            embImg['src'] = newSrc
                            tag.replace_with(embImgDiv)
                        else:
                            tag.decompose()
                
            ret['content'] = "%s" % contentZoomMeDiv.prettify().encode('utf-8')
            
        except Exception, e:
            util.printException()
            return (None, e)
Example #27
    def parseContent(self, page):
        ret = {'contentPics': [], 'contentPicCaptions': [], 'embPics': []}
        try:
            soup = BS(page)
            articleDiv = soup.find('div', id='article')
            siteTitleH2 = articleDiv.find('h2', 'sitetitle')
            ret['siteTitle'] = siteTitleH2.a.text

            titleH1 = articleDiv.find('h1')
            ret['title'] = titleH1.text.strip()

            picDiv = soup.find('div', 'watermark')
            if picDiv and picDiv.parent['class'] == [
                    'contentImage', 'floatNone'
            ]:
                ret['contentPics'].append(picDiv.a.img.get('src'))
                ret['contentPicCaptions'].append(picDiv.next_sibling.text)

            li = soup.find('li', 'downloadlinkstatic')
            if li != None:
                src = li.a.get('href')
                ret['contentMp3'] = src
            else:
                li = soup.find('li', 'listenlink')
                url = li.a.get('href')
                ret['contentMp3Page'] = url

            contentDiv = articleDiv.find('div', 'articleContent')
            dateDiv = contentDiv.find('div', 'dateblock')
            date = dateDiv.text.strip()
            ret['date'] = date

            contentZoomMeDiv = contentDiv.find('div', 'zoomMe')
            #delete mp3 player part
            mp3H5 = contentZoomMeDiv.find('h5', 'tagaudiotitle')
            if mp3H5:
                print mp3H5
                div = mp3H5.find_next_sibling('div', 'mediaplayer audioplayer')
                div.decompose()
                mp3H5.decompose()

            #delete script
            for ele in contentZoomMeDiv.find_all('script'):
                ele.decompose()

            for ul in contentZoomMeDiv.find_all('ul'):
                if ul.find('li', 'playlistlink') or \
                    ul.find('li', 'listenlink'):
                    ul.decompose()

            #print contentZoomMeDiv
            #delete until first p
            if contentZoomMeDiv.find('div', 'wordclick'):
                iterContent = contentZoomMeDiv.find('div', 'wordclick')
            else:
                iterContent = contentZoomMeDiv

            for tag in iterContent.find_all():
                if tag.name != None:
                    if tag.name != 'p' and tag.name != 'br':
                        tag.decompose()
                    else:
                        break

            keepDelete = False
            for tag in iterContent.find_all():
                if tag.name != None:
                    if keepDelete == False:
                        if tag.name == 'div':
                            tagClass = tag.get('class')
                            if tag.find('p') and self.reXHX.search(tag.p.text):
                                oriTag = tag
                                tag = tag.p
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()

                                #if  tag.find('span') and self.reXHX.search(tag.span.text):
                                #    print tag
                                #    #keepDelete = True
                                #    #tag.span.decompose()

                                #elif tag.find('em') and self.reXHX.search(tag.em.text):
                                #    print tag
                                #    #keepDelete = True
                                #    #tag.em.decompose()

                                #elif self.reXHX.search(tag.text):
                                #    print tag
                                #    #keepDelete = True
                                #    #tag.decompose()

                            elif tagClass:
                                if 'infgraphicsAttach' in tagClass:
                                    tag.decompose()
                                if 'boxwidget' in tagClass:
                                    #boxwidget w_Quiz2c w_QuizInside
                                    tag.decompose()
                        elif tag.name == 'iframe':
                            tag.decompose()

                        elif tag.name == 'p':
                            #check em
                            if tag.find('span') and self.reXHX.search(
                                    tag.span.text):
                                #print tag
                                #keepDelete = True
                                #tag.span.decompose()
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()

                            elif tag.find('em') and self.reXHX.search(
                                    tag.em.text):
                                #print tag
                                #keepDelete = True
                                #tag.em.decompose()
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()

                            elif self.reXHX.search(tag.text):
                                print tag
                                #keepDelete = True
                                #tag.decompose()
                                if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                                        tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                    keepDelete = True
                                    tag.decompose()

                    else:
                        tag.decompose()

            #print contentZoomMeDiv
            #filter photos in content
            for tag in iterContent.find_all():
                if tag.name == 'div' and tag.get('class'):
                    if 'embedded_content_object' in tag.get('class'):
                        embDiv = tag
                        embImgDiv = embDiv.find('div', 'contentImage')
                        if embImgDiv:
                            embImg = embImgDiv.find('img')
                            src = embImg.get('src')
                            ret['embPics'].append(src)
                            newSrc = os.path.basename(
                                urlparse.urlparse(src).path)
                            embImg['src'] = newSrc
                            tag.replace_with(embImgDiv)
                        else:
                            tag.decompose()

            ret['content'] = "%s" % contentZoomMeDiv.prettify().encode('utf-8')

        except Exception, e:
            util.printException()
            return (None, e)
Example #28
    def parseContent(self, page):
        page = page.decode('gbk')
        ret = {'stockBasicInfo':{}, 'stockOtherInfo':{}}
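        # map the Chinese field labels on the page to English dict keys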
        transMapCn2En = {
            u'公司代码': 'compCode',
            u'注册地址': 'registerAddr',
            u'法定代表人': 'legalRepresentative',
            u'董事会秘书姓名': 'boardSecretary',
            u'E-mail': 'E-mail',
            u'联系电话': 'phone',
            u'网址':'website',
            u'SSE行业': 'sseSector',
            u'是否上证180样本股': 'isSh180',
            u'是否境外上市': 'isOverseasListing',
            u'境外上市地': 'OverseasListingPlace'
        }

        try:
            soup = BS(page)           
            titleSpan = soup.find('span',  class_ ='pagetitle' ) 
            br = titleSpan.find('br')
            cmpNameAndCode = br.previous_element
            cmpName, code = re.split("\s+", cmpNameAndCode.strip())
            table = soup.find('table', width="100%", cellspacing="5", cellpadding="0", border="0")
            contentTd = table.find('td', class_="content", width="100%", valign="top")
            #contentTableList = contentTd.find_all('table', class="content", width="100%", bgcolor="#FFFFFF", align="center")
            tdList = contentTd.find_all('td', class_="content_b")
            for td in tdList:
                key = re.sub(r"\s+", "", td.text.strip().strip(':'))
                value = td.find_next_sibling('td').text.strip()
                if key == u'股票代码(A股/B股)':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['code'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['B_code'] = ''
                    else:
                        ret['stockBasicInfo']['B_code'] = valueList[1]
                elif key == u'上市日(A股/B股)':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['A_IPO_date'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['B_IPO_date'] = ''
                    else:
                        ret['stockBasicInfo']['B_IPO_date'] = valueList[1]
                elif key == u'可转债简称(代码)':
                    valueListMatch = re.search(ur"(.+?)\s*\((.+)\)", value)  # split "abbr (code)"
                    if valueListMatch != None:
                        groups = valueListMatch.groups()
                        ret['stockBasicInfo']['convertibleBondAbbr'] = groups[0]
                        ret['stockBasicInfo']['convertibleBondCode'] = groups[1]
                    else:
                        ret['stockBasicInfo']['convertibleBondAbbr'] = ''
                        ret['stockBasicInfo']['convertibleBondCode'] = ''
                elif key == u'公司简称(中/英)':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['compNameAbbr'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['compNameAbbrEn'] = ''
                    else:
                        ret['stockBasicInfo']['compNameAbbrEn'] = valueList[1]
                elif key == u'公司全称(中/英)':
                    valueList = value.split("\n")
                    ret['stockBasicInfo']['compNameCn'] = valueList[0]
                    ret['stockBasicInfo']['compNameEn'] = valueList[1]
                elif key == u'通讯地址(邮编)':
                    value = value.replace("\n", "")
                    valueListMatch = re.search(ur"(.+?)\s*\((.+)\)", value)  # split "address (postcode)"
                    if valueListMatch != None:
                        groups = valueListMatch.groups()
                        ret['stockBasicInfo']['contactAddr'] = groups[0]
                        ret['stockBasicInfo']['postcode'] = groups[1]
                    else:
                        ret['stockBasicInfo']['contactAddr'] = ''
                        ret['stockBasicInfo']['postcode'] = ''
                elif key.startswith(u'CSRC行业'):
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['sector'] = "/".join(valueList)
                    ret['stockBasicInfo']['sector1'] = valueList[0]
                    ret['stockBasicInfo']['sector2'] = valueList[1]
                    if valueList[2] == '-':
                        ret['stockBasicInfo']['sector3'] = valueList[1]
                    else:
                        ret['stockBasicInfo']['sector3'] = valueList[2]
                elif key == u'所属省/直辖市':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['province'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['city'] = ''
                    else:
                        ret['stockBasicInfo']['city'] = valueList[1]
                elif key == u'A股状态/B股状态':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['A_status'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['B_status'] = ''
                    else:
                        ret['stockBasicInfo']['B_status'] = valueList[1]
                else: 
                    if key in transMapCn2En:
                        ret['stockBasicInfo'][transMapCn2En[key]] = value.strip()
                    else:
                        ret['stockBasicInfo'][key] = value.strip()
        except Exception, e:
            util.printException()
            return (None, e)
Example #29
def extractBlockMatches(filepath1, filepath2, params, paramsSIFT, properties,
                        csvDir, exeload, load):
    """
  filepath1: the file path to an image of a section.
  filepath2: the file path to an image of another section.
  params: dictionary of parameters necessary for BlockMatching.
  exeload: an ExecutorService for parallel loading of image files.
  load: a function that knows how to load the image from the filepath.

  return False if the CSV file already exists, True if it has to be computed.
  """

    # Skip if pointmatches CSV file exists already:
    csvpath = os.path.join(
        csvDir,
        basename(filepath1) + '.' + basename(filepath2) + ".pointmatches.csv")
    if os.path.exists(csvpath):
        return False

    try:

        # Load files in parallel
        futures = [
            exeload.submit(Task(load, filepath1)),
            exeload.submit(Task(load, filepath2))
        ]

        fp1 = futures[0].get(
        )  # FloatProcessor, already Gaussian-blurred, contrast-corrected and scaled!
        fp2 = futures[1].get()  # FloatProcessor, idem

        # Define points from the mesh
        sourcePoints = ArrayList()
        # List to fill
        sourceMatches = ArrayList(
        )  # of PointMatch from filepath1 to filepath2

        # Don't use blockmatching if the dimensions are different
        use_blockmatching = fp1.getWidth() == fp2.getWidth() and fp1.getHeight(
        ) == fp2.getHeight()

        # Fill the sourcePoints
        mesh = TransformMesh(params["meshResolution"], fp1.width, fp1.height)
        PointMatch.sourcePoints(mesh.getVA().keySet(), sourcePoints)
        syncPrintQ("Extracting block matches for \n S: " + filepath1 +
                   "\n T: " + filepath2 + "\n  with " +
                   str(sourcePoints.size()) + " mesh sourcePoints.")
        # Run
        BlockMatching.matchByMaximalPMCCFromPreScaledImages(
            fp1,
            fp2,
            params["scale"],  # float
            params["blockRadius"],  # X
            params["blockRadius"],  # Y
            params["searchRadius"],  # X
            params["searchRadius"],  # Y
            params["minR"],  # float
            params["rod"],  # float
            params["maxCurvature"],  # float
            sourcePoints,
            sourceMatches)

        # At least some should match to accept the translation
        if len(sourceMatches) < max(20, len(sourcePoints) / 5) / 2:
            syncPrintQ(
                "Found only %i blockmatching pointmatches (from %i source points)"
                % (len(sourceMatches), len(sourcePoints)))
            syncPrintQ(
                "... therefore invoking SIFT pointmatching for:\n  S: " +
                basename(filepath1) + "\n  T: " + basename(filepath2))
            # Can fail if there is a shift larger than the searchRadius
            # Try SIFT features, which are location independent
            #
            # Images are now scaled: load originals
            futures = [
                exeload.submit(
                    Task(loadFloatProcessor,
                         filepath1,
                         params,
                         paramsSIFT,
                         scale=False)),
                exeload.submit(
                    Task(loadFloatProcessor,
                         filepath2,
                         params,
                         paramsSIFT,
                         scale=False))
            ]

            fp1 = futures[0].get()  # FloatProcessor, original
            fp2 = futures[1].get()  # FloatProcessor, original

            # Images can be of different size: scale them the same way
            area1 = fp1.width * fp1.height
            area2 = fp2.width * fp2.height

            if area1 == area2:
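                # equal areas: one cloned Params serves both images, with the
                # octave range derived from the scaled width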
                paramsSIFT1 = paramsSIFT.clone()
                paramsSIFT1.maxOctaveSize = int(
                    max(properties.get("SIFT_max_size", 2048),
                        fp1.width * params["scale"]))
                paramsSIFT1.minOctaveSize = int(paramsSIFT1.maxOctaveSize /
                                                pow(2, paramsSIFT1.steps))
                paramsSIFT2 = paramsSIFT1
            else:
                bigger, smaller = (fp1, fp2) if area1 > area2 else (fp2, fp1)
                target_width_bigger = int(
                    max(1024, bigger.width * params["scale"]))
                if 1024 == target_width_bigger:
                    target_width_smaller = int(1024 * float(smaller.width) /
                                               bigger.width)
                else:
                    target_width_smaller = smaller.width * params["scale"]
                #
                paramsSIFT1 = paramsSIFT.clone()
                paramsSIFT1.maxOctaveSize = target_width_bigger
                paramsSIFT1.minOctaveSize = int(paramsSIFT1.maxOctaveSize /
                                                pow(2, paramsSIFT1.steps))
                paramsSIFT2 = paramsSIFT.clone()
                paramsSIFT2.maxOctaveSize = target_width_smaller
                paramsSIFT2.minOctaveSize = int(paramsSIFT2.maxOctaveSize /
                                                pow(2, paramsSIFT2.steps))

            ijSIFT1 = SIFT(FloatArray2DSIFT(paramsSIFT1))
            features1 = ArrayList()  # of Point instances
            ijSIFT1.extractFeatures(fp1, features1)

            ijSIFT2 = SIFT(FloatArray2DSIFT(paramsSIFT2))
            features2 = ArrayList()  # of Point instances
            ijSIFT2.extractFeatures(fp2, features2)
            # Vector of PointMatch instances
            sourceMatches = FloatArray2DSIFT.createMatches(
                features1,
                features2,
                params.get(
                    "max_sd",
                    1.5),  # max_sd: maximal difference in size (ratio max/min)
                TranslationModel2D(),
                params.get("max_id", Double.MAX_VALUE
                           ),  # max_id: maximal distance in image space
                params.get("rod", 0.9))  # rod: ratio of best vs second best

        # Store pointmatches
        savePointMatches(os.path.basename(filepath1),
                         os.path.basename(filepath2), sourceMatches, csvDir,
                         params)

        return True
    except:
        printException()
Example #30
def ensurePointMatches(filepaths, csvDir, params, paramsSIFT, n_adjacent,
                       properties):
    """ If a pointmatches csv file doesn't exist, will create it. """
    w = ParallelTasks("ensurePointMatches",
                      exe=newFixedThreadPool(properties["n_threads"]))
    exeload = newFixedThreadPool()
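    # Two pools: 'w' runs the pointmatching tasks while 'exeload' parallelizes
    # the loading of image files within each task.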
    try:
        if properties.get("use_SIFT", False):
            syncPrintQ("use_SIFT is True")
            # Pre-extract SIFT features for all images first
            # ensureSIFTFeatures returns the features list, so each Future would
            # keep it alive in memory; don't hold onto all the futures at once,
            # and therefore consume the tasks in chunks:
            chunk_size = properties["n_threads"] * 2
            count = 1
            for result in w.chunkConsume(
                    chunk_size,  # tasks to submit before starting to wait for futures
                (Task(ensureSIFTFeatures,
                      filepath,
                      paramsSIFT,
                      properties,
                      csvDir,
                      validateByFileExists=properties.get(
                          "SIFT_validateByFileExists"))
                 for filepath in filepaths)):
                count += 1
                if 0 == count % chunk_size:
                    syncPrintQ(
                        "Completed extracting or validating SIFT features for %i images."
                        % count)
            w.awaitAll()
            syncPrintQ(
                "Completed extracting or validating SIFT features for all images."
            )
            # Compute pointmatches across adjacent sections
            count = 1
            for result in w.chunkConsume(
                    chunk_size,
                    generateSIFTMatches(filepaths, n_adjacent, params,
                                        paramsSIFT, properties, csvDir)):
                count += 1
                syncPrintQ("Completed SIFT pointmatches %i/%i" %
                           (count, len(filepaths) * n_adjacent))
        else:
            # Use blockmatches
            syncPrintQ("using blockmatches")
            loadFPMem = SoftMemoize(lambda path: loadFloatProcessor(
                path, params, paramsSIFT, scale=True),
                                    maxsize=properties["n_threads"] +
                                    n_adjacent)
            count = 1
            for result in w.chunkConsume(
                    properties["n_threads"],
                    pointmatchingTasks(filepaths, csvDir, params, paramsSIFT,
                                       n_adjacent, exeload, properties,
                                       loadFPMem)):
                if result:  # is False when CSV file already exists
                    syncPrintQ("Completed %i/%i" %
                               (count, len(filepaths) * n_adjacent))
                count += 1
            syncPrintQ("Awaiting all remaining pointmatching tasks to finish.")
        w.awaitAll()
        syncPrintQ("Finished all pointmatching tasks.")
    except:
        printException()
    finally:
        exeload.shutdown()
        w.destroy()
Example #31
    def parseContent(self, page):
        page = page.decode('gbk')
        ret = {'stockBasicInfo': {}, 'stockOtherInfo': {}}
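        # map the Chinese field labels on the page to English dict keys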
        transMapCn2En = {
            u'公司代码': 'compCode',
            u'注册地址': 'registerAddr',
            u'法定代表人': 'legalRepresentative',
            u'董事会秘书姓名': 'boardSecretary',
            u'E-mail': 'E-mail',
            u'联系电话': 'phone',
            u'网址': 'website',
            u'SSE行业': 'sseSector',
            u'是否上证180样本股': 'isSh180',
            u'是否境外上市': 'isOverseasListing',
            u'境外上市地': 'OverseasListingPlace'
        }

        try:
            soup = BS(page)
            titleSpan = soup.find('span', class_='pagetitle')
            br = titleSpan.find('br')
            cmpNameAndCode = br.previous_element
            cmpName, code = re.split("\s+", cmpNameAndCode.strip())
            table = soup.find('table',
                              width="100%",
                              cellspacing="5",
                              cellpadding="0",
                              border="0")
            contentTd = table.find('td',
                                   class_="content",
                                   width="100%",
                                   valign="top")
            #contentTableList = contentTd.find_all('table', class="content", width="100%", bgcolor="#FFFFFF", align="center")
            tdList = contentTd.find_all('td', class_="content_b")
            for td in tdList:
                key = re.sub(r"\s+", "", td.text.strip().strip(':'))
                value = td.find_next_sibling('td').text.strip()
                if key == u'股票代码(A股/B股)':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['code'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['B_code'] = ''
                    else:
                        ret['stockBasicInfo']['B_code'] = valueList[1]
                elif key == u'上市日(A股/B股)':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['A_IPO_date'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['B_IPO_date'] = ''
                    else:
                        ret['stockBasicInfo']['B_IPO_date'] = valueList[1]
                elif key == u'可转债简称(代码)':
                    valueListMatch = re.search(ur"(.+?)\s*\((.+)\)", value)  # split "abbr (code)"
                    if valueListMatch != None:
                        groups = valueListMatch.groups()
                        ret['stockBasicInfo']['convertibleBondAbbr'] = groups[
                            0]
                        ret['stockBasicInfo']['convertibleBondCode'] = groups[
                            1]
                    else:
                        ret['stockBasicInfo']['convertibleBondAbbr'] = ''
                        ret['stockBasicInfo']['convertibleBondCode'] = ''
                elif key == u'公司简称(中/英)':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['compNameAbbr'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['compNameAbbrEn'] = ''
                    else:
                        ret['stockBasicInfo']['compNameAbbrEn'] = valueList[1]
                elif key == u'公司全称(中/英)':
                    valueList = value.split("\n")
                    ret['stockBasicInfo']['compNameCn'] = valueList[0]
                    ret['stockBasicInfo']['compNameEn'] = valueList[1]
                elif key == u'通讯地址(邮编)':
                    value = value.replace("\n", "")
                    valueListMatch = re.search(ur"(.+?)\s*\((.+)\)", value)  # split "address (postcode)"
                    if valueListMatch != None:
                        groups = valueListMatch.groups()
                        ret['stockBasicInfo']['contactAddr'] = groups[0]
                        ret['stockBasicInfo']['postcode'] = groups[1]
                    else:
                        ret['stockBasicInfo']['contactAddr'] = ''
                        ret['stockBasicInfo']['postcode'] = ''
                elif key.startswith(u'CSRC行业'):
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['sector'] = "/".join(valueList)
                    ret['stockBasicInfo']['sector1'] = valueList[0]
                    ret['stockBasicInfo']['sector2'] = valueList[1]
                    if valueList[2] == '-':
                        ret['stockBasicInfo']['sector3'] = valueList[1]
                    else:
                        ret['stockBasicInfo']['sector3'] = valueList[2]
                elif key == u'所属省/直辖市':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['province'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['city'] = ''
                    else:
                        ret['stockBasicInfo']['city'] = valueList[1]
                elif key == u'A股状态/B股状态':
                    valueList = re.split(r"\s*/\s*", value)
                    ret['stockBasicInfo']['A_status'] = valueList[0]
                    if len(valueList) == 1 or valueList[1] == '-':
                        ret['stockBasicInfo']['B_status'] = ''
                    else:
                        ret['stockBasicInfo']['B_status'] = valueList[1]
                else:
                    if key in transMapCn2En:
                        ret['stockBasicInfo'][
                            transMapCn2En[key]] = value.strip()
                    else:
                        ret['stockBasicInfo'][key] = value.strip()
        except Exception, e:
            util.printException()
            return (None, e)
Example #32
def savePointMatches(img_filename1,
                     img_filename2,
                     pointmatches,
                     directory,
                     params,
                     coords_header=["x1", "y1", "x2", "y2"]):
    filename = basename(img_filename1) + '.' + basename(
        img_filename2) + ".pointmatches.csv"
    path = os.path.join(directory, filename)
    msg = [str(len(pointmatches))]
    ra = None
    try:
        """
    with open(path, 'w') as csvfile:
      w = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
      # First two rows: parameter names and values
      keys = params.keys()
      msg.append("keys: " + ",".join(map(str, keys)))
      msg.append("vals: " + ",".join(str(params[key]) for key in keys))
      #for pm in pointmatches:
      #  msg.append(", ".join(map(str, PointMatches.asRow(pm))))
      w.writerow(keys)
      w.writerow(tuple(params[key] for key in keys))
      # PointMatches header
      if 0 == len(pointmatches):
        # Can't know whether there are 2 or 3 dimensions per coordinate
        w.writerow(coords_header)
      else:
        w.writerow(PointMatches.csvHeader(next(iter(pointmatches)))) # support both lists and sets
      # One PointMatch per row
      for pm in pointmatches:
        w.writerow(PointMatches.asRow(pm))
      # Ensure it's written
      csvfile.flush()
      os.fsync(csvfile.fileno())
    """
        # DEBUG write differently, the above FAILS for ~20 out of 130,000 files
        lines = []
        keys = params.keys()
        lines.append(",".join(map(str, keys)))
        lines.append(",".join(map(str, (params[key] for key in keys))))
        header = coords_header if 0 == len(pointmatches) \
                               else PointMatches.csvHeader(next(iter(pointmatches)))
        lines.append(",".join(header))
        for pm in pointmatches:
            p1 = pm.getP1().getW()  # a double[] array
            p2 = pm.getP2().getW()  # a double[] array
            lines.append("%f,%f,%f,%f" % (p1[0], p1[1], p2[0], p2[1]))
        body = "\n".join(lines)
        ra = RandomAccessFile(path, 'rw')
        ra.writeBytes(body)
        ra.getFD().sync()  # ensure it's written
    except:
        syncPrintQ("Failed to save pointmatches at %s\n%s" %
                   (path, "\n".join(msg)))
        printException()
        if os.path.exists(path):
            os.remove(path)
    finally:
        if ra is not None:
            ra.close()
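A round-trip sketch for savePointMatches and the loadPointMatches of Example #25; file names and directory are hypothetical, and Point/PointMatch come from mpicbg.models (assuming the usual Fiji/Jython imports):

    from mpicbg.models import Point, PointMatch
    from jarray import array
    # One match between (0,0) in the first image and (1.5,2.0) in the second.
    pms = [PointMatch(Point(array([0.0, 0.0], 'd')),
                      Point(array([1.5, 2.0], 'd')))]
    savePointMatches("t0001.tif", "t0002.tif", pms, "/data/csv/", params)
    loaded = loadPointMatches("t0001.tif", "t0002.tif", "/data/csv/", params)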
Example #33
    def parseContent(self, page):
        transMapCn2En = {
            u'公司代码': 'compCode',
            u'公司简称': 'compNameAbbr',
            u'公司全称': 'compNameCn',
            u'英文名称': 'compNameEn',
            u'注册地址': 'regAddr',
            u'A股代码' : 'code',
            u'A股简称' : 'nameCn',
            u'A股上市日期': 'A_IPO_date',
            u'A股总股本': 'A_totalShares',
            u'A股流通股本':'A_circulationStock',
            u'B股代码': 'B_code',
            u'B股简称': 'B_nameCn',
            u'B股上市日期': 'B_IPO_date',
            u'B股总股本': 'B_totalShares',
            u'B股流通股本': 'B_circulationStock',
            u'地区': 'region',
            u'省份': 'province',
            u'城市': 'city',
            u'所属行业': 'sector',
            u'公司网址': 'webSite'
        }
        transMapEn2Cn = {}
        for k in transMapCn2En:
            transMapEn2Cn[transMapCn2En[k]] = k

        ret = {'stockList':[], 'headerList':[]}
        try:
            reEmpty = re.compile(r'\s+')
            soup = BS(page)           
            table = soup.find('table')
            trList  = table.find_all('tr')
            trHeader = trList[0]

            thList = trHeader.find_all('td')
            headerList = [ reEmpty.sub('', td.text.strip()) for td in thList ]
            ret['headerList'] = headerList

            for tr in trList[1:]: # for each stock
                tdList = tr.find_all('td')
                tdTextList = [ td.text.strip() for td in tdList ]
                
                stockDict = {}
                for i in range(len(tdTextList)):
                    enKey = transMapCn2En[headerList[i]]
                    value = tdTextList[i]
                    if enKey in ('A_totalShares', 'A_circulationStock', 'B_totalShares', 'B_circulationStock'):  # strip thousands separators
                        value = value.replace(",", "")
                    stockDict[enKey] = value
                    
                # e.g. 'J 金融业': the first character is the sector code, the rest the sector name
                if 'sector' in stockDict:
                    theSector = stockDict['sector']
                    stockDict['sectorCode'] = theSector[0]
                    stockDict['sectorName'] = theSector[1:]

                ret['stockList'].append(stockDict)
        except Exception, e:
            util.printException()
            return (None, e)