def doOneTask(self, task):
    # 0. sign it
    self.signTask(task)
    task.status = 'parsing'

    # 1. call handler
    if task.handler not in self.hs:
        #from MainPageHandler import MainPageHandler
        m = __import__(task.handler)
        c = getattr(m, task.handler)
        self.hs[task.handler] = c()
    h = self.hs[task.handler]

    # 2. check type and do it
    isOK = True
    if task.taskType in ('media', 'page'):
        # deal with it
        output = {}
        try:
            output = h.parse(task)
        except Exception, e:
            # todo set task failed here
            util.printException()
            myLogger.error("sth wrong [%s][%s] in parsing, set task[%d] failed"
                           % (task.handler, e, task.id))
            isOK = False
        if 'newTasks' in output:
            for t in output['newTasks']:
                # get a taskId from my manager
                self.m.packTask(t)
                self.signTask(t)
                self.m.addTask(t)

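# A minimal, self-contained sketch of the handler-loading pattern used in
# doOneTask above: import a module named after the handler, grab the class of
# the same name, instantiate it once and cache it. The name "MainPageHandler"
# is only an example; nothing below is tied to this project's classes.
_handler_cache = {}

def get_handler(name):
    if name not in _handler_cache:
        module = __import__(name)      # e.g. imports MainPageHandler.py
        cls = getattr(module, name)    # class named like the module
        _handler_cache[name] = cls()   # instantiate once, reuse afterwards
    return _handler_cache[name]
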
def run(self):
    myLogger.info("worker %s begin to run" % (self.name))
    myLogger.info("my manager's inQueue size [%d]" % (self.inQueue.qsize()))
    self.hungerly = False
    while True:
        try:
            # get a task from the queue (or note that we are starved)
            get = None
            try:
                get = self.inQueue.get(timeout=1)
                self.hungerly = False
            except Queue.Empty, e:
                self.hungerly = True
                myLogger.debug('thread [%s] is hungerly now' % (self.name))
            if self.m.shouldExit == True:
                myLogger.debug('thread [%s] is exiting' % (self.name))
                break
            if self.hungerly == False:
                # do the task
                myLogger.info("thread [%s], get task[%s] from queue" % (self.name, get.id))
                time.sleep(3)
                self.doOneTask(get)
            else:
                time.sleep(2)
        except Exception, e:
            util.printException()
            myLogger.error('sth wrong[%s] in thread [%s]' % (e, self.name))
            if get is not None:
                get.status = 'failed'
                if get.msg == '':  # keep the first recorded error message
                    get.msg = util.exprException()

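# Self-contained sketch of the same worker-loop shape: poll a queue with a
# short timeout so the thread wakes up regularly to check an exit flag instead
# of blocking forever. Written against the Python 2 Queue/threading modules
# used by this code; the project's Worker/Manager classes are not involved.
import Queue
import threading
import time

def worker_loop(in_queue, should_exit, handle):
    # usage: stop = threading.Event(); threading.Thread(target=worker_loop, args=(q, stop, do_one)).start()
    while True:
        task = None
        try:
            task = in_queue.get(timeout=1)  # short timeout keeps the loop responsive
        except Queue.Empty:
            pass
        if should_exit.is_set():
            break
        if task is not None:
            handle(task)
        else:
            time.sleep(1)                   # queue is empty ("hungry"): back off a bit
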
def download(self, url, to, postdata={}, timeout=60, times=3, safeFetch=True):
    myLogger.info("fetcher download from [%s] to [%s]" % (url, to))
    ret = None
    while times != 0:
        status = ''
        if safeFetch == True:
            resp = self.safeFetch(url, postdata, timeout)
        else:
            resp, status = self.fetch(url, postdata, timeout)
        if util.mkdir(os.path.dirname(to)) == False:
            times -= 1
            ret = None
            continue
        if status == 'OK' or (status == '' and resp != None):
            # fetch ok: stream the response to disk in 5 MB chunks
            CHUNK = 1024 * 1024 * 5
            with open(to, 'wb') as f:
                ret = True
                try:
                    while True:
                        chunk = resp.read(CHUNK)
                        if not chunk:
                            break
                        f.write(chunk)
                except Exception, e:
                    msg = util.exprException()
                    util.printException()
                    ret = None
        if ret == True:
            break
        times -= 1
    return ret

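# Hypothetical standalone sketch of the same retry-and-stream pattern, using
# only the standard library (urllib2); the project's safeFetch/util helpers
# are replaced by plain-library calls and are not assumed here.
import os
import urllib2

def download_simple(url, to, timeout=60, times=3, chunk=1024 * 1024 * 5):
    while times > 0:
        try:
            # Make sure the target directory exists before writing.
            d = os.path.dirname(to)
            if d and not os.path.isdir(d):
                os.makedirs(d)
            resp = urllib2.urlopen(url, timeout=timeout)
            with open(to, 'wb') as f:
                while True:
                    data = resp.read(chunk)
                    if not data:
                        break
                    f.write(data)
            return True
        except Exception:
            times -= 1
    return False
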
def parseContent(self, page):
    page = page.decode('gbk')
    ret = {'stockList': [], 'nextPage': ''}
    try:
        soup = BS(page)
        # '下一页' = "next page" link
        theA = soup.find('a', text='下一页')
        if theA != None and theA['href'] != '':
            ret['nextPage'] = theA['href']
        table = soup.find('table', width="100%", cellspacing="1", cellpadding="2",
                          border="0", bgcolor="#337fb2")
        trList = table.find_all('tr')
        for tr in trList[1:]:
            tdList = tr.find_all('td')
            code = tdList[0].text
            name = tdList[1].text
            href = tdList[0].a['href']
            ret['stockList'].append({'code': code, 'name': name, 'href': href})
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def ensureSIFTFeatures(filepath, paramsSIFT, properties, csvDir, validateByFileExists=False):
    """ filepath: to the image from which SIFT features have been or have to be extracted.
        paramsSIFT: FloatArray2DSIFT.Params instance.
        csvDir: directory into which serialized features have been or will be saved.
        validateByFileExists: whether to merely check that the .obj file exists as a quick form of validation.

        First check if serialized features exist for the image, and if the Params match.
        Otherwise extract the features and store them serialized.
        Returns the ArrayList of Feature instances. """
    path = os.path.join(csvDir, os.path.basename(filepath) + ".SIFT-features.obj")
    if validateByFileExists:
        if os.path.exists(path):
            return True
    # An ArrayList whose last element is a mpicbg.imagefeatures.FloatArray2DSIFT.Param
    # and all other elements are mpicbg.imagefeatures.Feature
    features = deserialize(path) if os.path.exists(path) else None
    if features:
        if features.get(features.size() - 1).equals(paramsSIFT):
            features.remove(features.size() - 1)  # removes the Params
            syncPrintQ("Loaded %i SIFT features for %s" % (features.size(), os.path.basename(filepath)))
            return features
        else:
            # Remove the file: paramsSIFT have changed
            os.remove(path)
    # Else, extract de novo:
    try:
        # Extract features
        imp = loadImp(filepath)
        ip = imp.getProcessor()
        paramsSIFT = paramsSIFT.clone()
        ijSIFT = SIFT(FloatArray2DSIFT(paramsSIFT))
        features = ArrayList()  # of Feature instances
        ijSIFT.extractFeatures(ip, features)
        ip = None
        imp.flush()
        imp = None
        features.add(paramsSIFT)  # append Params instance at the end for future validation
        serialize(features, path)
        features.remove(features.size() - 1)  # to return without the Params for immediate use
        syncPrintQ("Extracted %i SIFT features for %s" % (features.size(), os.path.basename(filepath)))
    except:
        printException()
    return features

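# Hypothetical plain-Python sketch of the same cache-and-validate pattern used
# above: serialize results together with the parameters that produced them, and
# on load recompute if the stored parameters no longer match. It uses pickle
# instead of the mpicbg serializer; compute_features is a stand-in for the
# actual SIFT extraction and is passed in by the caller.
import os
import pickle

def ensure_cached(filepath, params, cache_dir, compute_features):
    path = os.path.join(cache_dir, os.path.basename(filepath) + ".features.pkl")
    if os.path.exists(path):
        with open(path, 'rb') as f:
            stored_params, features = pickle.load(f)
        if stored_params == params:
            return features
        os.remove(path)  # parameters changed: discard the stale cache
    features = compute_features(filepath, params)
    with open(path, 'wb') as f:
        pickle.dump((params, features), f)
    return features
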
def parseContent(self, page):
    ret = {'excelUrl': ''}
    try:
        soup = BS(page)
        td = soup.find('td', align="right", width="60px", valign="bottom")
        ret['excelUrl'] = td.a['href']
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    ret = {}
    soup = BS(page)
    try:
        li = soup.find('li', 'downloadlinkstatic')
        src = li.a.get('href')
        ret['contentMp3'] = src
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    ret = []
    page = page.decode('utf-8')
    try:
        soup = BS(page)
        tableDiv = soup.find('div', class_='genTable')
        trList = tableDiv.find_all('tr')
        for tr in trList:
            ret.append(tr.text)
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    ret = []
    try:
        soup = BS(page)
        headerDiv = soup.find(id='indexItems')
        headlis = headerDiv.select('li')
        for li in headlis:
            # note: rstrip(".html") removes any trailing '.', 'h', 't', 'm', 'l'
            # characters, not just the literal ".html" suffix
            iUrl = urlparse.urljoin(li.a['href'].rstrip(".html") + "/", "pc10.html?tab=None")
            #iUrl = urlparse.urljoin(li.a['href'].rstrip(".html")+"/", "pc0.html?tab=None")
            ret.append(iUrl)
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    ret = {}
    try:
        soup = BS(page.decode('gbk'))
        table = soup.find('table', width="92%", cellspacing="1", cellpadding="2", border="0")
        trList = table.find_all('tr')
        for tr in trList:
            tdList = tr.find_all('td')
            if len(tdList) <= 1:
                continue
            k = tdList[0].text.strip().split(":")[0]
            v = tdList[1].text.strip()
            ret[k] = v
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    ret = []
    try:
        soup = BS(page)
        headerDiv = soup.find(id='header')
        headlis = headerDiv.select('li.header_navigation_item.has_child')
        links = []
        for li in headlis:
            if li.a.text == 'Audio':
                links = li.find_all('a', 'section_link')
                break
        for link in links:
            ret.append(link['href'])
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    ret = {'stockInfoPage': {}, 'stockList': []}
    try:
        soup = BS(page)
        trList = soup.find_all('tr', class_='tr_normal')
        for tr in trList:
            tdList = tr.find_all('td')
            href = ''
            if hasattr(tdList[1], 'a') and tdList[1].a != None and tdList[1].a != '':
                href = tdList[1].a['href']
            textList = [td.text for td in tdList]
            textList.append(href)
            ret['stockList'].append(textList)
            ret['stockInfoPage'][textList[0]] = href
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    ret = {"zipPic": {}, "contentPage": {}}
    try:
        soup = BS(page)
        articleUl = soup.find(id="articleItems")
        # ul = soup.find('ul', "bullet_orange")
        divs = articleUl.find_all("div", "media-block")
        for div in divs:
            zipPic = div.img.get("src")
            # has audio?
            a = div.find("a", "img-wrapper")
            if a and a["href"] != "":
                url = a.get("href")
                key = url.replace("/", "_")
                ret["zipPic"][key] = zipPic
                ret["contentPage"][key] = url
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def extractSIFTMatches(filepath1, filepath2, params, paramsSIFT, properties, csvDir):
    # Skip if pointmatches CSV file exists already:
    csvpath = os.path.join(csvDir,
                           basename(filepath1) + '.' + basename(filepath2) + ".pointmatches.csv")
    if os.path.exists(csvpath):
        return False
    try:
        # Load from CSV files or extract features de novo
        features1 = ensureSIFTFeatures(filepath1, paramsSIFT, properties, csvDir)
        features2 = ensureSIFTFeatures(filepath2, paramsSIFT, properties, csvDir)
        #syncPrintQ("Loaded %i features for %s\n %i features for %s" % (features1.size(), os.path.basename(filepath1),
        #                                                               features2.size(), os.path.basename(filepath2)))
        # Vector of PointMatch instances
        sourceMatches = FloatArray2DSIFT.createMatches(
            features1,
            features2,
            params.get("max_sd", 1.5),  # max_sd: maximal difference in size (ratio max/min)
            TranslationModel2D(),
            params.get("max_id", Double.MAX_VALUE),  # max_id: maximal distance in image space
            params.get("rod", 0.9))  # rod: ratio of best vs second best
        syncPrintQ("Found %i SIFT pointmatches for %s vs %s" % (sourceMatches.size(),
                                                                os.path.basename(filepath1),
                                                                os.path.basename(filepath2)))
        # Store pointmatches
        savePointMatches(os.path.basename(filepath1),
                         os.path.basename(filepath2),
                         sourceMatches,
                         csvDir,
                         params)
        return True
    except:
        printException()

def loadFeatures(img_filename, directory, params, validateOnly=False, epsilon=0.00001, verbose=True):
    """ Attempts to load features from filename + ".features.csv" if it exists,
        returning a list of Constellation features or None.
        params: dictionary of parameters with which features are wanted now,
                to compare with the parameters with which features were extracted.
                In case of mismatch, return None.
        epsilon: allowed error when comparing floating-point values.
        validateOnly: if True, return after checking that parameters match. """
    try:
        csvpath = os.path.join(directory, basename(img_filename) + ".features.csv")
        if os.path.exists(csvpath):
            with open(csvpath, 'r') as csvfile:
                reader = csv.reader(csvfile, delimiter=',', quotechar='"')
                # First line contains parameter names, second line their values
                if not checkParams(params, reader.next(), reader.next(), epsilon):
                    return None
                if validateOnly:
                    return True  # would return None above, which is falsy
                reader.next()  # skip header with column names
                features = [Constellation.fromRow(map(float, row)) for row in reader]
                if verbose:
                    syncPrint("Loaded %i features for %s" % (len(features), img_filename))
                return features
        else:
            if verbose:
                syncPrint("No stored features found at %s" % csvpath)
            return None
    except:
        syncPrint("Could not load features for %s" % img_filename)
        printException()
        return None

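# Hypothetical standalone sketch of the CSV layout this loader expects:
# row 1 = parameter names, row 2 = parameter values, row 3 = column header,
# remaining rows = one feature per row. checkParams and Constellation are
# replaced by simple stand-ins, and the parameters are assumed numeric.
import csv

def load_rows_with_params(csvpath, wanted_params, epsilon=1e-5):
    with open(csvpath, 'r') as f:
        reader = csv.reader(f)
        names = next(reader)       # row 1: parameter names
        values = next(reader)      # row 2: parameter values used when writing
        stored = dict(zip(names, map(float, values)))
        for k, v in wanted_params.items():
            if abs(stored.get(k, float('inf')) - v) > epsilon:
                return None        # parameters changed: caller must recompute
        next(reader)               # row 3: column-name header, skipped
        return [map(float, row) for row in reader]
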
def parseContent(self, page):
    ret = {'zipPic': {}, 'contentPage': {}}
    try:
        soup = BS(page)
        titleDiv = soup.find(id='archive').h2
        classTitle = titleDiv.text
        ul = soup.find('ul', "bullet_orange")
        divs = ul.find_all('div', 'archive_rowmm')
        for div in divs:
            zipPic = div.img.get('src')
            # has audio?
            a = div.h4.a
            if a.select('span.assignedIcon.asIcoAudio') != []:
                span = a.find('span', 'underlineLink')
                url = a.get('href')
                key = url.replace('/', '_')
                ret['zipPic'][key] = zipPic
                ret['contentPage'][key] = url
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def loadPointMatches(img1_filename, img2_filename, directory, params, epsilon=0.00001, verbose=True):
    """ Attempts to load point matches from
        filename1 + '.' + filename2 + ".pointmatches.csv" if it exists,
        returning a list of PointMatch instances or None.
        params: dictionary of parameters with which pointmatches are wanted now,
                to compare with the parameters with which pointmatches were made.
                In case of mismatch, return None.
        epsilon: allowed error when comparing floating-point values. """
    try:
        csvpath = os.path.join(directory,
                               basename(img1_filename) + '.' + basename(img2_filename) + ".pointmatches.csv")
        if not os.path.exists(csvpath):
            if verbose:
                syncPrint("No stored pointmatches found at %s" % csvpath)
            return None
        with open(csvpath, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            # First line contains parameter names, second line their values
            if not checkParams(params, reader.next(), reader.next(), epsilon):
                return None
            if next(reader, None) is None:  # skip header with column names
                return []  # zero pointmatches
            pointmatches = PointMatches.fromRows(reader).pointmatches
            if verbose:
                syncPrint("Loaded %i pointmatches for %s, %s" % (len(pointmatches), img1_filename, img2_filename))
            return pointmatches
    except:
        syncPrint("Could not load pointmatches for pair %s, %s" % (img1_filename, img2_filename))
        printException()
        return None

def parseContent(self, page):
    ret = {'contentPics': [], 'contentPicCaptions': [], 'embPics': []}
    try:
        soup = BS(page)
        articleDiv = soup.find('div', id='article')
        siteTitleH2 = articleDiv.find('h2', 'sitetitle')
        ret['siteTitle'] = siteTitleH2.a.text
        titleH1 = articleDiv.find('h1')
        ret['title'] = titleH1.text.strip()

        picDiv = soup.find('div', 'watermark')
        if picDiv and picDiv.parent['class'] == ['contentImage', 'floatNone']:
            ret['contentPics'].append(picDiv.a.img.get('src'))
            ret['contentPicCaptions'].append(picDiv.next_sibling.text)

        li = soup.find('li', 'downloadlinkstatic')
        if li != None:
            src = li.a.get('href')
            ret['contentMp3'] = src
        else:
            li = soup.find('li', 'listenlink')
            url = li.a.get('href')
            ret['contentMp3Page'] = url

        contentDiv = articleDiv.find('div', 'articleContent')
        dateDiv = contentDiv.find('div', 'dateblock')
        date = dateDiv.text.strip()
        ret['date'] = date

        contentZoomMeDiv = contentDiv.find('div', 'zoomMe')
        # delete mp3 player part
        mp3H5 = contentZoomMeDiv.find('h5', 'tagaudiotitle')
        if mp3H5:
            print mp3H5
            div = mp3H5.find_next_sibling('div', 'mediaplayer audioplayer')
            div.decompose()
            mp3H5.decompose()
        # delete scripts
        for ele in contentZoomMeDiv.find_all('script'):
            ele.decompose()
        for ul in contentZoomMeDiv.find_all('ul'):
            if ul.find('li', 'playlistlink') or \
               ul.find('li', 'listenlink'):
                ul.decompose()
        #print contentZoomMeDiv

        # delete until first p
        if contentZoomMeDiv.find('div', 'wordclick'):
            iterContent = contentZoomMeDiv.find('div', 'wordclick')
        else:
            iterContent = contentZoomMeDiv
        for tag in iterContent.find_all():
            if tag.name != None:
                if tag.name != 'p' and tag.name != 'br':
                    tag.decompose()
                else:
                    break

        keepDelete = False
        for tag in iterContent.find_all():
            if tag.name != None:
                if keepDelete == False:
                    if tag.name == 'div':
                        tagClass = tag.get('class')
                        if tag.find('p') and self.reXHX.search(tag.p.text):
                            oriTag = tag
                            tag = tag.p
                            if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                               tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                keepDelete = True
                            tag.decompose()
                            #if tag.find('span') and self.reXHX.search(tag.span.text):
                            #    print tag
                            #    #keepDelete = True
                            #    #tag.span.decompose()
                            #elif tag.find('em') and self.reXHX.search(tag.em.text):
                            #    print tag
                            #    #keepDelete = True
                            #    #tag.em.decompose()
                            #elif self.reXHX.search(tag.text):
                            #    print tag
                            #    #keepDelete = True
                            #    #tag.decompose()
                        elif tagClass:
                            if 'infgraphicsAttach' in tagClass:
                                tag.decompose()
                            if 'boxwidget' in tagClass:
                                #boxwidget w_Quiz2c w_QuizInside
                                tag.decompose()
                    elif tag.name == 'iframe':
                        tag.decompose()
                    elif tag.name == 'p':
                        # check em
                        if tag.find('span') and self.reXHX.search(tag.span.text):
                            #print tag
                            #keepDelete = True
                            #tag.span.decompose()
                            if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                               tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                keepDelete = True
                            tag.decompose()
                        elif tag.find('em') and self.reXHX.search(tag.em.text):
                            #print tag
                            #keepDelete = True
                            #tag.em.decompose()
                            if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                               tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                keepDelete = True
                            tag.decompose()
                        elif self.reXHX.search(tag.text):
                            print tag
                            #keepDelete = True
                            #tag.decompose()
                            if tag.find_next_sibling('h5', 'tagaudiotitle') or \
                               tag.find_next_sibling('div', ['mediaplayer', 'audioplayer']):
                                keepDelete = True
                            tag.decompose()
                else:
                    tag.decompose()
        #print contentZoomMeDiv

        # filter photos in content
        for tag in iterContent.find_all():
            if tag.name == 'div' and tag.get('class'):
                if 'embedded_content_object' in tag.get('class'):
                    embDiv = tag
                    embImgDiv = embDiv.find('div', 'contentImage')
                    if embImgDiv:
                        embImg = embImgDiv.find('img')
                        src = embImg.get('src')
                        ret['embPics'].append(src)
                        newSrc = os.path.basename(urlparse.urlparse(src).path)
                        embImg['src'] = newSrc
                        tag.replace_with(embImgDiv)
                    else:
                        tag.decompose()

        ret['content'] = "%s" % contentZoomMeDiv.prettify().encode('utf-8')
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def parseContent(self, page):
    page = page.decode('gbk')
    ret = {'stockBasicInfo': {}, 'stockOtherInfo': {}}
    transMapCn2En = {
        u'公司代码': 'compCode',
        u'注册地址': 'registerAddr',
        u'法定代表人': 'legalRepresentative',
        u'董事会秘书姓名': 'boardSecretary',
        u'E-mail': 'E-mail',
        u'联系电话': 'phone',
        u'网址': 'website',
        u'SSE行业': 'sseSector',
        u'是否上证180样本股': 'isSh180',
        u'是否境外上市': 'isOverseasListing',
        u'境外上市地': 'OverseasListingPlace'
    }
    try:
        soup = BS(page)
        titleSpan = soup.find('span', class_='pagetitle')
        br = titleSpan.find('br')
        cmpNameAndCode = br.previous_element
        cmpName, code = re.split("\s+", cmpNameAndCode.strip())
        table = soup.find('table', width="100%", cellspacing="5", cellpadding="0", border="0")
        contentTd = table.find('td', class_="content", width="100%", valign="top")
        #contentTableList = contentTd.find_all('table', class="content", width="100%", bgcolor="#FFFFFF", align="center")
        tdList = contentTd.find_all('td', class_="content_b")
        # The branch keys below are the Chinese field labels on the page:
        # 股票代码(A股/B股) = stock code (A/B share), 上市日(A股/B股) = listing date (A/B share),
        # 可转债简称(代码) = convertible bond abbreviation (code), 公司简称/全称(中/英) = company
        # short/full name (Chinese/English), 通讯地址(邮编) = contact address (postcode),
        # CSRC行业 = CSRC industry, 所属省/直辖市 = province/municipality, A股状态/B股状态 = A/B share status.
        for td in tdList:
            key = re.sub(r"\s+", "", td.text.strip().strip(':'))
            value = td.find_next_sibling('td').text.strip()
            if key == u'股票代码(A股/B股)':
                valueList = re.split(r"\s*/\s*", value)
                ret['stockBasicInfo']['code'] = valueList[0]
                if len(valueList) == 1 or valueList[1] == '-':
                    ret['stockBasicInfo']['B_code'] = ''
                else:
                    ret['stockBasicInfo']['B_code'] = valueList[1]
            elif key == u'上市日(A股/B股)':
                valueList = re.split(r"\s*/\s*", value)
                ret['stockBasicInfo']['A_IPO_date'] = valueList[0]
                if len(valueList) == 1 or valueList[1] == '-':
                    ret['stockBasicInfo']['B_IPO_date'] = ''
                else:
                    ret['stockBasicInfo']['B_IPO_date'] = valueList[1]
            elif key == u'可转债简称(代码)':
                valueListMatch = re.search(ur"(\.+)\s*((.+))", value)
                if valueListMatch != None:
                    groups = valueListMatch.groups()
                    ret['stockBasicInfo']['convertibleBondAbbr'] = groups[0]
                    ret['stockBasicInfo']['convertibleBondCode'] = groups[1]
                else:
                    ret['stockBasicInfo']['convertibleBondAbbr'] = ''
                    ret['stockBasicInfo']['convertibleBondCode'] = ''
            elif key == u'公司简称(中/英)':
                valueList = re.split(r"\s*/\s*", value)
                ret['stockBasicInfo']['compNameAbbr'] = valueList[0]
                if len(valueList) == 1 or valueList[1] == '-':
                    ret['stockBasicInfo']['compNameAbbrEn'] = ''
                else:
                    ret['stockBasicInfo']['compNameAbbrEn'] = valueList[1]
            elif key == u'公司全称(中/英)':
                valueList = value.split("\n")
                ret['stockBasicInfo']['compNameCn'] = valueList[0]
                ret['stockBasicInfo']['compNameEn'] = valueList[1]
            elif key == u'通讯地址(邮编)':
                value = value.replace("\n", "")
                valueListMatch = re.search(ur"(\.+)\s*((.+))", value)
                if valueListMatch != None:
                    groups = valueListMatch.groups()
                    ret['stockBasicInfo']['contactAddr'] = groups[0]
                    ret['stockBasicInfo']['postcode'] = groups[1]
                else:
                    ret['stockBasicInfo']['contactAddr'] = ''
                    ret['stockBasicInfo']['postcode'] = ''
            elif key.startswith(u'CSRC行业'):
                valueList = re.split(r"\s*/\s*", value)
                ret['stockBasicInfo']['sector'] = "/".join(valueList)
                ret['stockBasicInfo']['sector1'] = valueList[0]
                ret['stockBasicInfo']['sector2'] = valueList[1]
                if valueList[2] == '-':
                    ret['stockBasicInfo']['sector3'] = valueList[1]
                else:
                    ret['stockBasicInfo']['sector3'] = valueList[2]
            elif key == u'所属省/直辖市':
                valueList = re.split(r"\s*/\s*", value)
                ret['stockBasicInfo']['province'] = valueList[0]
                if len(valueList) == 1 or valueList[1] == '-':
                    ret['stockBasicInfo']['city'] = ''
                else:
                    ret['stockBasicInfo']['city'] = valueList[1]
            elif key == u'A股状态/B股状态':
                valueList = re.split(r"\s*/\s*", value)
                ret['stockBasicInfo']['A_status'] = valueList[0]
                if len(valueList) == 1 or valueList[1] == '-':
                    ret['stockBasicInfo']['B_status'] = ''
                else:
                    ret['stockBasicInfo']['B_status'] = valueList[1]
            else:
                if key in transMapCn2En:
                    ret['stockBasicInfo'][transMapCn2En[key]] = value.strip()
                else:
                    ret['stockBasicInfo'][key] = value.strip()
    except Exception, e:
        util.printException()
        return (None, e)
    return ret

def extractBlockMatches(filepath1, filepath2, params, paramsSIFT, properties, csvDir, exeload, load):
    """ filepath1: the file path to an image of a section.
        filepath2: the file path to an image of another section.
        params: dictionary of parameters necessary for BlockMatching.
        exeload: an ExecutorService for parallel loading of image files.
        load: a function that knows how to load the image from the filepath.
        return False if the CSV file already exists, True if it has to be computed. """
    # Skip if pointmatches CSV file exists already:
    csvpath = os.path.join(csvDir,
                           basename(filepath1) + '.' + basename(filepath2) + ".pointmatches.csv")
    if os.path.exists(csvpath):
        return False
    try:
        # Load files in parallel
        futures = [exeload.submit(Task(load, filepath1)),
                   exeload.submit(Task(load, filepath2))]
        fp1 = futures[0].get()  # FloatProcessor, already Gaussian-blurred, contrast-corrected and scaled!
        fp2 = futures[1].get()  # FloatProcessor, idem

        # Define points from the mesh
        sourcePoints = ArrayList()  # List to fill
        sourceMatches = ArrayList()  # of PointMatch from filepath1 to filepath2

        # Don't use blockmatching if the dimensions are different
        use_blockmatching = fp1.getWidth() == fp2.getWidth() and fp1.getHeight() == fp2.getHeight()

        # Fill the sourcePoints
        mesh = TransformMesh(params["meshResolution"], fp1.width, fp1.height)
        PointMatch.sourcePoints(mesh.getVA().keySet(), sourcePoints)
        syncPrintQ("Extracting block matches for \n S: " + filepath1 + "\n T: " + filepath2 +
                   "\n with " + str(sourcePoints.size()) + " mesh sourcePoints.")
        # Run
        BlockMatching.matchByMaximalPMCCFromPreScaledImages(
            fp1,
            fp2,
            params["scale"],  # float
            params["blockRadius"],  # X
            params["blockRadius"],  # Y
            params["searchRadius"],  # X
            params["searchRadius"],  # Y
            params["minR"],  # float
            params["rod"],  # float
            params["maxCurvature"],  # float
            sourcePoints,
            sourceMatches)

        # At least some should match to accept the translation
        if len(sourceMatches) < max(20, len(sourcePoints) / 5) / 2:
            syncPrintQ("Found only %i blockmatching pointmatches (from %i source points)"
                       % (len(sourceMatches), len(sourcePoints)))
            syncPrintQ("... therefore invoking SIFT pointmatching for:\n S: " + basename(filepath1) +
                       "\n T: " + basename(filepath2))
            # Can fail if there is a shift larger than the searchRadius
            # Try SIFT features, which are location independent
            #
            # Images are now scaled: load originals
            futures = [exeload.submit(Task(loadFloatProcessor, filepath1, params, paramsSIFT, scale=False)),
                       exeload.submit(Task(loadFloatProcessor, filepath2, params, paramsSIFT, scale=False))]
            fp1 = futures[0].get()  # FloatProcessor, original
            fp2 = futures[1].get()  # FloatProcessor, original

            # Images can be of different size: scale them the same way
            area1 = fp1.width * fp1.height
            area2 = fp2.width * fp2.height
            if area1 == area2:
                paramsSIFT1 = paramsSIFT.clone()
                paramsSIFT1.maxOctaveSize = int(max(properties.get("SIFT_max_size", 2048),
                                                    fp1.width * params["scale"]))
                paramsSIFT1.minOctaveSize = int(paramsSIFT1.maxOctaveSize / pow(2, paramsSIFT1.steps))
                paramsSIFT2 = paramsSIFT1
            else:
                bigger, smaller = (fp1, fp2) if area1 > area2 else (fp2, fp1)
                target_width_bigger = int(max(1024, bigger.width * params["scale"]))
                if 1024 == target_width_bigger:
                    target_width_smaller = int(1024 * float(smaller.width) / bigger.width)
                else:
                    target_width_smaller = smaller.width * params["scale"]
                #
                paramsSIFT1 = paramsSIFT.clone()
                paramsSIFT1.maxOctaveSize = target_width_bigger
                paramsSIFT1.minOctaveSize = int(paramsSIFT1.maxOctaveSize / pow(2, paramsSIFT1.steps))
                paramsSIFT2 = paramsSIFT.clone()
                paramsSIFT2.maxOctaveSize = target_width_smaller
                paramsSIFT2.minOctaveSize = int(paramsSIFT2.maxOctaveSize / pow(2, paramsSIFT2.steps))

            ijSIFT1 = SIFT(FloatArray2DSIFT(paramsSIFT1))
            features1 = ArrayList()  # of Point instances
            ijSIFT1.extractFeatures(fp1, features1)

            ijSIFT2 = SIFT(FloatArray2DSIFT(paramsSIFT2))
            features2 = ArrayList()  # of Point instances
            ijSIFT2.extractFeatures(fp2, features2)

            # Vector of PointMatch instances
            sourceMatches = FloatArray2DSIFT.createMatches(
                features1,
                features2,
                params.get("max_sd", 1.5),  # max_sd: maximal difference in size (ratio max/min)
                TranslationModel2D(),
                params.get("max_id", Double.MAX_VALUE),  # max_id: maximal distance in image space
                params.get("rod", 0.9))  # rod: ratio of best vs second best

        # Store pointmatches
        savePointMatches(os.path.basename(filepath1),
                         os.path.basename(filepath2),
                         sourceMatches,
                         csvDir,
                         params)
        return True
    except:
        printException()

def ensurePointMatches(filepaths, csvDir, params, paramsSIFT, n_adjacent, properties):
    """ If a pointmatches csv file doesn't exist, will create it. """
    w = ParallelTasks("ensurePointMatches", exe=newFixedThreadPool(properties["n_threads"]))
    exeload = newFixedThreadPool()
    try:
        if properties.get("use_SIFT", False):
            syncPrintQ("use_SIFT is True")
            # Pre-extract SIFT features for all images first
            # ensureSIFTFeatures returns the features list so the Future will hold it in memory: can't hold onto them
            # therefore consume the tasks in chunks:
            chunk_size = properties["n_threads"] * 2
            count = 1
            for result in w.chunkConsume(
                    chunk_size,  # tasks to submit before starting to wait for futures
                    (Task(ensureSIFTFeatures, filepath, paramsSIFT, properties, csvDir,
                          validateByFileExists=properties.get("SIFT_validateByFileExists"))
                     for filepath in filepaths)):
                count += 1
                if 0 == count % chunk_size:
                    syncPrintQ("Completed extracting or validating SIFT features for %i images." % count)
            w.awaitAll()
            syncPrintQ("Completed extracting or validating SIFT features for all images.")
            # Compute pointmatches across adjacent sections
            count = 1
            for result in w.chunkConsume(chunk_size,
                                         generateSIFTMatches(filepaths, n_adjacent, params,
                                                             paramsSIFT, properties, csvDir)):
                count += 1
                syncPrintQ("Completed SIFT pointmatches %i/%i" % (count, len(filepaths) * n_adjacent))
        else:
            # Use blockmatches
            syncPrintQ("using blockmatches")
            loadFPMem = SoftMemoize(lambda path: loadFloatProcessor(path, params, paramsSIFT, scale=True),
                                    maxsize=properties["n_threads"] + n_adjacent)
            count = 1
            for result in w.chunkConsume(properties["n_threads"],
                                         pointmatchingTasks(filepaths, csvDir, params, paramsSIFT,
                                                            n_adjacent, exeload, properties, loadFPMem)):
                if result:  # is False when CSV file already exists
                    syncPrintQ("Completed %i/%i" % (count, len(filepaths) * n_adjacent))
                count += 1
        syncPrintQ("Awaiting all remaining pointmatching tasks to finish.")
        w.awaitAll()
        syncPrintQ("Finished all pointmatching tasks.")
    except:
        printException()
    finally:
        exeload.shutdown()
        w.destroy()

def savePointMatches(img_filename1, img_filename2, pointmatches, directory, params,
                     coords_header=["x1", "y1", "x2", "y2"]):
    filename = basename(img_filename1) + '.' + basename(img_filename2) + ".pointmatches.csv"
    path = os.path.join(directory, filename)
    msg = [str(len(pointmatches))]
    ra = None
    try:
        """
        with open(path, 'w') as csvfile:
            w = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
            # First two rows: parameter names and values
            keys = params.keys()
            msg.append("keys: " + ",".join(map(str, keys)))
            msg.append("vals: " + ",".join(str(params[key]) for key in keys))
            #for pm in pointmatches:
            #    msg.append(", ".join(map(str, PointMatches.asRow(pm))))
            w.writerow(keys)
            w.writerow(tuple(params[key] for key in keys))
            # PointMatches header
            if 0 == len(pointmatches):
                # Can't know whether there are 2 or 3 dimensions per coordinate
                w.writerow(coords_header)
            else:
                w.writerow(PointMatches.csvHeader(next(iter(pointmatches))))  # support both lists and sets
            # One PointMatch per row
            for pm in pointmatches:
                w.writerow(PointMatches.asRow(pm))
            # Ensure it's written
            csvfile.flush()
            os.fsync(csvfile.fileno())
        """
        # DEBUG write differently, the above FAILS for ~20 out of 130,000 files
        lines = []
        keys = params.keys()
        lines.append(",".join(map(str, keys)))
        lines.append(",".join(map(str, (params[key] for key in keys))))
        header = coords_header if 0 == len(pointmatches) \
                 else PointMatches.csvHeader(next(iter(pointmatches)))
        lines.append(",".join(header))
        for pm in pointmatches:
            p1 = pm.getP1().getW()  # a double[] array
            p2 = pm.getP2().getW()  # a double[] array
            lines.append("%f,%f,%f,%f" % (p1[0], p1[1], p2[0], p2[1]))
        body = "\n".join(lines)
        ra = RandomAccessFile(path, 'rw')
        ra.writeBytes(body)
        ra.getFD().sync()  # ensure it's written
    except:
        syncPrintQ("Failed to save pointmatches at %s\n%s" % (path, "\n".join(msg)))
        printException()
        if os.path.exists(path):
            os.remove(path)
    finally:
        if ra is not None:
            ra.close()

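# Hypothetical plain-Python version of the same write path (two parameter rows,
# one header row, one match per row), using a regular file plus flush/fsync in
# place of Java's RandomAccessFile; point_rows is assumed to be a list of
# (x1, y1, x2, y2) tuples.
import os

def save_matches_csv(path, params, point_rows, coords_header=("x1", "y1", "x2", "y2")):
    keys = sorted(params.keys())
    lines = [",".join(keys),
             ",".join(str(params[k]) for k in keys),
             ",".join(coords_header)]
    lines.extend("%f,%f,%f,%f" % row for row in point_rows)
    try:
        with open(path, 'w') as f:
            f.write("\n".join(lines))
            f.flush()
            os.fsync(f.fileno())  # make sure the bytes reach disk
    except Exception:
        if os.path.exists(path):
            os.remove(path)  # don't leave a truncated file behind
        raise
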
def parseContent(self, page):
    transMapCn2En = {
        u'公司代码': 'compCode',
        u'公司简称': 'compNameAbbr',
        u'公司全称': 'compNameCn',
        u'英文名称': 'compNameEn',
        u'注册地址': 'regAddr',
        u'A股代码': 'code',
        u'A股简称': 'nameCn',
        u'A股上市日期': 'A_IPO_date',
        u'A股总股本': 'A_totalShares',
        u'A股流通股本': 'A_circulationStock',
        u'B股代码': 'B_code',
        u'B股简称': 'B_nameCn',
        u'B股上市日期': 'B_IPO_date',
        u'B股总股本': 'B_totalShares',
        u'B股流通股本': 'B_circulationStock',
        u'地区': 'region',
        u'省份': 'province',
        u'城市': 'city',
        u'所属行业': 'sector',
        u'公司网址': 'webSite'
    }
    transMapEn2Cn = {}
    for k in transMapCn2En:
        transMapEn2Cn[transMapCn2En[k]] = k
    ret = {'stockList': [], 'headerList': []}
    try:
        reEmpty = re.compile(r'\s+')
        soup = BS(page)
        table = soup.find('table')
        trList = table.find_all('tr')
        trHeader = trList[0]
        thList = trHeader.find_all('td')
        headerList = [reEmpty.sub('', td.text.strip()) for td in thList]
        ret['headerList'] = headerList
        for tr in trList[1:]:
            # for each stock
            tdList = tr.find_all('td')
            tdTextList = [td.text.strip() for td in tdList]
            stockDict = {}
            for i in range(len(tdTextList)):
                enKey = transMapCn2En[headerList[i]]
                value = tdTextList[i]
                if enKey in ('A_totalShares', 'A_circulationStock',
                             'B_totalShares', 'B_circulationStock'):
                    # pull out the ',' thousands separators
                    value = value.replace(",", "")
                stockDict[enKey] = value
            # e.g. sector text "J 金融业" ("J Financials"): first char is the
            # sector code, the rest is the sector name
            if 'sector' in stockDict:
                theSector = stockDict['sector']
                stockDict['sectorCode'] = theSector[0]
                stockDict['sectorName'] = theSector[1:]
            ret['stockList'].append(stockDict)
    except Exception, e:
        util.printException()
        return (None, e)
    return ret
