def setIndex():
    """Walk the local mirror, parse every HTML page, and append
    {path, text, title} entries to NorroenDyrd.index.

    Navigation/landing pages listed in `nobody` carry no indexable body
    and are skipped.
    """
    from MyHTMLParser import MyHTMLParser
    import os

    # Pages with no indexable content.
    nobody = {
        "index.html", "index1.html", "index2.html", "index3.html",
        "index4.html", "ru3.html", "ru2.html", "ru1.html", "rut.html",
        "rus.html", "ru.html",
    }

    html_files = []
    seen = set()  # O(1) duplicate check instead of scanning the list each time
    for parent, _dirs, filenames in os.walk(NorroenDyrd.mirror):
        for fn in filenames:
            # endswith() instead of the old substring test, which also
            # accepted names like "page.html.bak".
            if fn in nobody or not fn.endswith(".html"):
                continue
            full = os.path.join(parent, fn)
            if full not in seen:
                seen.add(full)
                html_files.append(full)

    for path in html_files:
        # Fresh parser per document so state never leaks between files.
        parser = MyHTMLParser()
        # `fh` avoids shadowing the walk loop's file-list variable.
        with open(path, "r", encoding="utf-8") as fh:
            for line in fh:
                parser.feed(line)
        NorroenDyrd.index.append({
            "path": path.replace(NorroenDyrd.mirror, NorroenDyrd.base),
            "text": parser.plaintext,
            "title": parser.title,
        })
def __GetEmailContent__(self, filePath):
    """Extract subject + text body from an email file, strip HTML, clean it."""
    self._myHtmlParserObj = MyHTMLParser()
    with open(filePath, 'r') as handle:
        emailMessage = email.message_from_file(handle)

    # Concatenate every text/plain or text/html payload into one string.
    emailBody = ""
    if emailMessage.is_multipart():
        for part in emailMessage.walk():
            if part.get_content_type() in ("text/html", "text/plain"):
                emailBody = emailBody + ' ' + part.get_payload()
    elif emailMessage.get_content_type() in ("text/html", "text/plain"):
        emailBody = emailMessage.get_payload()

    # Cleaned subject line (Python 2: Message still supports has_key).
    emailSubject = ''
    if emailMessage.has_key('subject'):
        emailSubject = self.__CleanEmailContent__(emailMessage['subject'])

    parsedBody = self._myHtmlParserObj.GetParsedContentFromHtml(emailBody)
    emailContent = str(emailSubject) + " " + str(parsedBody)
    return self.__CleanEmailContent__(emailContent)
class Inverter():
    # Builds an inverted index: parses every file in the source directory
    # with MyHTMLParser, then writes the doc-id map, posting file and hash
    # table.  Relies on module-level globals `doc_id` and `term_count`
    # (dicts keyed by numeric file id) — presumably filled in by
    # MyHTMLParser.feed(); TODO confirm where they are defined.
    # NOTE(review): Python 2 code (dict.iteritems below).
    #global doc_id, term_count

    def __init__(self, config):
        # config: dict of paths/settings ('str_src_dir', 'str_dst_dir',
        # 'str_doc_id_file_name', 'file_id_encoding_len', 'file_name_len').
        # The constructor runs the entire indexing pipeline as a side effect.
        self.config = config
        self.ht = myhashtable(config)
        self.htmlparser = MyHTMLParser(self.config, self.ht)
        self.start_batch_processing()
        self.write_file_map()
        self.ht.write_posting_file(term_count)
        self.ht.write_hash_table()

    def start_batch_processing(self):
        # Feed every file in the source directory to the parser, tagging
        # each with a sequential numeric file id.
        file_id = 0
        for in_file in os.listdir(self.config['str_src_dir']):
            #if in_file not in ['medium.html','simple.html']: continue #for testing
            with open(self.config['str_src_dir'] + in_file, 'r') as f:
                doc_id[file_id] = in_file
                term_count[file_id] = 0
                self.htmlparser.feed(f.read(), file_id)
                file_id += 1

    # writing doc_id <--> doc_name file
    def write_file_map(self):
        #writing document id file
        # NOTE(review): file is opened binary ('wb+') but written with
        # str.format — works on Python 2 only.
        with open(
                self.config['str_dst_dir'] +
                self.config['str_doc_id_file_name'], 'wb+') as f:
            for did, txt in doc_id.iteritems():
                # Zero-padded id, left-justified name, one entry per line.
                f.write('{0:0>{1}d} {2:'
                        '<{3}s}\n'.format(did,
                                          self.config['file_id_encoding_len'],
                                          txt, self.config['file_name_len']))
def f(idx, q, r):
    # Worker: consume URL items from queue `q`, download each page into
    # directory data<idx>, and parse it for links.  Python 2 code.
    # NOTE(review): parameter `r` is unused, and `localQueue` is created
    # but never filled, so the inner while loop always runs exactly once
    # (get_nowait raises -> break) — confirm intent.
    path = "data%s"%(idx)
    os.makedirs(path)
    while True:
        item = q.get()
        # Sentinel item shuts the worker down.
        if( item.item_type == ITEM_QUIT ):
            break;
        count = 0
        localQueue = Queue()
        current = item.data
        while True:
            print current
            fo = urlopen(current)
            data = fo.read()
            # Save the raw page under a sequential file name.
            name = "%s/%s"%(path,count)
            fw = open( name, "w" )
            count = count + 1
            fw.write(data)
            fw.close()
            fo.close()
            p = MyHTMLParser()
            try:
                p.feed(data)
            except:
                # Bare except: tolerate malformed HTML, keep the links
                # collected so far.
                pass
            for href in p.hrefs:
                print item.data, ": ", href
            try:
                current = localQueue.get_nowait()
            except:
                # Queue empty -> done with this item.
                break;
def __init__(self, config):
    # Build the hash table and parser, then run the full indexing
    # pipeline (batch parse, doc-id map, posting file, hash table) as a
    # constructor side effect.
    # NOTE(review): `term_count` is an undeclared global — confirm it is
    # defined at module level.
    self.config = config
    self.ht = myhashtable(config)
    self.htmlparser = MyHTMLParser(self.config, self.ht)
    self.start_batch_processing()
    self.write_file_map()
    self.ht.write_posting_file(term_count)
    self.ht.write_hash_table()
def create_journey_instructions(steps):
    """Strip HTML from each step's instructions and join them with '>>>>>'."""
    parser = MyHTMLParser()  # HTML parser for directions API data
    pieces = []
    for step in steps:
        parser.feed(step['html_instructions'])
        pieces.append(parser.get_data() + ">>>>>")
    instruct = "".join(pieces)
    print(instruct)
    return instruct
def table_maker(pd_row):
    #read data from html-like file
    # Build one long-format DataFrame of fundamentals (dividend per share,
    # total revenue, dividend, EPS) from a row whose 'data' field holds
    # HTML.  Metadata columns come from parser `h`; the 'data' column
    # comes from soupparser `p`.
    h = MyHTMLParser()
    h.feed(pd_row['data'])
    soup = soupparser()
    p = soup.handle_data(pd_row)
    # Per-metric value lists, in the same order as `names` below.
    dividendpershare = [h.type_dividendpershare, h.asofdate_dividendpershare,
                        h.reporttype_dividendpershare, h.period_dividendpershare,
                        h.currency_dividendpershare, p.data_dividendpershare,
                        h.exdate_dividendpershare, h.recorddate_dividendpershare,
                        h.paydate_dividendpershare, h.declarationdate_dividendpershare]
    totalrevenue = [h.type_totalrevenue, h.asofdate_totalrevenue,
                    h.reporttype_totalrevenue, h.period_totalrevenue,
                    h.currency_totalrevenue, p.data_totalrevenue,
                    h.exdate_totalrevenue, h.recorddate_totalrevenue,
                    h.paydate_totalrevenue, h.declarationdate_totalrevenue]
    dividend = [h.type_dividend, h.asofdate_dividend, h.reporttype_dividend,
                h.period_dividend, h.currency_dividend, p.data_dividend,
                h.exdate_dividend, h.recorddate_dividend, h.paydate_dividend,
                h.declarationdate_dividend]
    eps = [h.type_eps, h.asofdate_eps, h.reporttype_eps, h.period_eps,
           h.currency_eps, p.data_eps, h.exdate_eps, h.recorddate_eps,
           h.paydate_eps, h.declarationdate_eps]
    #sort data and make it into a dataframe
    names = ['type', 'asofdate', 'reporttype', 'period', 'currency','data',
             'exdate', 'recorddate', 'paydate', 'declarationdate']

    def make_dataframe(list1):
        # Zip column names with value lists; wrapping each in pd.Series
        # tolerates unequal lengths, then forward-fill pads the shorter
        # columns.
        dict1 = {names[i]: list1[i] for i in range(10)}
        dataframe1 = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in dict1.items()]))
        dataframe1 = dataframe1.fillna(method='ffill')
        return dataframe1

    dividendpershare_dataframe = make_dataframe(dividendpershare)
    totalrevenue_dataframe = make_dataframe(totalrevenue)
    dividend_dataframe = make_dataframe(dividend)
    eps_dataframe = make_dataframe(eps)
    # Stack the four metric tables vertically.
    table1 = pd.concat([dividendpershare_dataframe, totalrevenue_dataframe,
                        dividend_dataframe, eps_dataframe],
                       axis = 0, ignore_index=True)
    # Tag every row with the request id of the input row.
    reqId1 = [pd_row['reqId']] * len(table1['type'])
    table1['reqId'] = pd.Series(np.array(reqId1), index = table1.index)
    #format each column to put into sql
    table1['type'] = table1['type'].astype(str)
    table1['reporttype'] = table1['reporttype'].astype(str)
    table1['period'] = table1['period'].astype(str)
    table1['asofdate'] = pd.to_datetime(table1['asofdate'])
    table1['exdate'] = pd.to_datetime(table1['exdate'])
    table1['recorddate'] = pd.to_datetime(table1['recorddate'])
    table1['paydate'] = pd.to_datetime(table1['paydate'])
    table1['declarationdate'] = pd.to_datetime(table1['declarationdate'])
    #drop_duplicate line
    table1 = table1.drop_duplicates()
    return table1
def downloadPictures():
    """Fetch the page at MyUriEncode.getUrl() and feed its HTML to the parser."""
    curl = MyCurl()
    curl.set_url(MyUriEncode.getUrl())
    # The client hands back the buffer it will write the response into.
    buffer = curl.set_buffer(BytesIO())
    curl.exec()
    curl.close()
    # Decode the raw response bytes before parsing.
    body = buffer.getvalue().decode('utf-8')
    parser = MyHTMLParser()
    parser.feed(body)
def getListOfPokemonPages():
    """Scrape Bulbapedia's National Pokédex list and return absolute page URLs."""
    pokemonListAddress = "http://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"
    pokeListResponse = urllib.request.urlopen(pokemonListAddress)
    pokeListPage = str(pokeListResponse.read())
    parser = MyHTMLParser()
    parser.feed(pokeListPage)
    baseBulbapediaAdress = "http://bulbapedia.bulbagarden.net"
    # Turn every scraped relative link into an absolute URL.
    parser.pokeListParser.pokemonURLs = [
        baseBulbapediaAdress + link
        for link in parser.pokeListParser.pokemonURLs
    ]
    return parser.pokeListParser.pokemonURLs
def dataFetcher(self, jobname=None):
    """Fetch a job's page HTML and return its dependency lists.

    Args:
        jobname: job to fetch; defaults to self.jobName ("method
            overloading" — the poor man's default-argument dispatch).

    Returns:
        (predependencyList, postdependencyList) — de-duplicated via set(),
        so element order is not guaranteed.

    Fixes: removed no-op `global` declarations (nothing is assigned to
    those names here), `== None` -> `is None`, and the dead `root =`
    binding (HTMLParser.feed returns None).
    """
    if jobname is None:
        mydatafetcher = URLCreator(jobname=self.jobName)
    else:
        mydatafetcher = URLCreator(jobname=jobname)
    contents = mydatafetcher.loadUrl()

    parser = MyHTMLParser()
    parser.feed(contents)  # results accumulate in parser.preList/postList
    postdependencyList = list(set(parser.postList))
    predependencyList = list(set(parser.preList))
    # print('pre:',predependencyList)
    return predependencyList, postdependencyList
def __init__(self, emailDirPath, spamMappingFilePath):
    # emailDirPath: directory holding the raw email files.
    # spamMappingFilePath: file mapping each email file name to Spam/Ham.
    self._emailDirPath = emailDirPath
    self._spamIdentifierMapFilePath = spamMappingFilePath
    # {'inmail.1' :'Spam', 'inmail.10':'Ham',...........'inmail.200':'Spam'}
    self._emailFileNameToSpamOrHamMap = {}
    self._emailFileNameToSpamOrHamMap = self.__LoadFileNameToSpamOrHamMapping__(
        self._spamIdentifierMapFilePath)
    self._myHtmlParserObj = MyHTMLParser()
    # Scratch file stamped with the run's start time; recreated each run.
    self.fp = "test.txt"
    if os.path.isfile(self.fp):
        os.remove(self.fp)
    with open(self.fp, 'w') as handle:
        handle.write(str(datetime.datetime.now()))
    self.bulkList = []
    self._ESMgrObject = ElasticSearchManager(Resource.INDEX_NAME,
                                             Resource.TYPE_NAME)
def crawl(self, depth, frontier):
    # Breadth-first crawl: visit every URL in `frontier`, collect the
    # links found into the next level's frontier, then recurse one level
    # deeper.  Stops at self.maxdepth; individual URLs are skipped once
    # self.numPages pages have been crawled or on re-visits.
    if depth > self.maxdepth:
        return
    nextLevelFrontier = list()
    for url in frontier:
        # only parse when the number of crawled pages are not exceeding maximum
        if len(self.crawledlist
               ) < self.numPages and url not in self.crawledlist:
            # pass in the URL and create the request
            request = req.Request(
                url,
                headers={
                    "User-Agent":
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
                })
            try:
                time.sleep(1)  # politeness delay between requests
                # send the request to the url and get the response
                data = req.urlopen(request).read().decode("utf-8")
                parser = MyHTMLParser()
                parser.feed(data)
                # Handling Nonetype
                if self.record(url, depth):
                    self.create_web_file(data, len(self.crawledlist))
                    print(url)
                    print("Finished:", len(self.crawledlist), "files")
                    print("current depth: ", depth)
                nextLevelFrontier += parser.urls
            # try to catch errors when encounter
            except urllib.error.HTTPError as err:
                # handling page not found error
                if err.code == 404:
                    continue
                else:
                    raise
    self.crawl(depth + 1, nextLevelFrontier)
def getImageTagFromHtml(file):
    """Parse an HTML string and return the parser's image-tag map.

    Bug fix: the original called feed() and read .map on the MyHTMLParser
    *class*; feed() is an instance method (it mutates parser state), so it
    must be invoked on a fresh instance.
    """
    parser = MyHTMLParser()
    parser.feed(file)
    return parser.map
# grab links from html stdin input and canonicalize them # spit them out on newlines for processing from MyHTMLParser import MyHTMLParser import fileinput import sys import argparse argparser = argparse.ArgumentParser(description='Parse an html dump of an IA page for matching links') #argparser.add_argument('--path', dest='path', # default='/', # help='path to match in links') args = argparser.parse_args() parser = MyHTMLParser() html = sys.stdin.read() parser.feed(html) for link in parser.get_details(): print link
# NOTE(review): this chunk starts mid-function — the header of the
# enclosing def (presumably getAllHtmlFiles, called in __main__ below)
# is outside this view.
    fname,fext=os.path.splitext(file)
    # NOTE(review): `(fext in s for s in filetype)` is a generator object
    # and therefore ALWAYS truthy — every file passes this filter.
    # Probably meant `any(fext in s for s in filetype)`.  Confirm intent.
    if (fext in s for s in filetype):
        filelist.append(os.path.join(rootpath,file))
    return filelist


def getImageTagFromHtml(file):
    # NOTE(review): feed() is called on the MyHTMLParser class rather
    # than an instance — this raises unless feed is a static method.
    MyHTMLParser.feed(file)
    return MyHTMLParser.map


def getImageAltTextByImageName(image):
    # Stub: not implemented yet.
    return


if __name__=="__main__":
    # Parse every HTML file found under rootpath.
    files=getAllHtmlFiles(rootpath,filetype)
    for file in files:
        fileobject=open(file,'r')
        data=fileobject.read()
        htmlParse=MyHTMLParser()
        htmlParse.feed(data)
# NOTE(review): this chunk starts mid-loop — the enclosing loop that pops
# from srcList begins outside this view.  Python 2 code.
    path = srcList.pop()
    # Skip thumbnails/placeholders tagged "__jpeg".
    if not "__jpeg" in path:
        v = HttpClient()
        v.DownloadFile(path,"/Users/zhouxq/images/"+pathdir)
        print path


if __name__ == "__main__":
    # Ask for a page URL; its last path segment names the download dir.
    urlStr = raw_input("url:")
    urlList = urlStr.split("/")
    pathdir = urlList[len(urlList)-1]
    v = HttpClient()
    value = v.Get(urlStr,urlStr)
    #r1 = re.compile(r"http://\S*\.jpe*g")
    from MyHTMLParser import MyHTMLParser
    parser = MyHTMLParser()
    # Page is GBK-encoded; re-encode to UTF-8 before parsing.
    value = value.decode('gbk').encode('utf-8')
    print value
    parser.feed(value,"input")
    nodes = parser.get_nodes()
    print nodes
    # Collect every src attribute from the parsed nodes.
    srcList = []
    for node in nodes:
        for attr in node["attrs"]:
            if attr == "src":
                srcList.append(node["attrs"][attr])
                #print node["attrs"][attr]
def parseItem(self, htmlText):
    """Parse `htmlText` with the instance's request handler and return JSON."""
    item_parser = MyHTMLParser(htmlText, self.requestHandler)
    return item_parser.getJson()
'''
Current driver simple HTMl processor.
which builds DOM tree.
'''
from MyHTMLParser import MyHTMLParser

# Sample document used to exercise the parser (includes a bare "<>" to
# test malformed-tag handling).
mockData = "<html><head><title>This is the</title></head><body><h1>This<br /><>is the <span>header</span></h1></body></html>"

cursor = MyHTMLParser()
cursor.feed(mockData)  #builds tree
cursor.printTree()  #prints tree!


def findElementsByClass(className):
    # Delegate class-name lookup to the module-level parser's DOM tree.
    return cursor.findElementsByClass(className)
# NOTE(review): chunk starts mid-call — the req.Request construction and
# earlier add_header calls are outside this view; the trailing try: block's
# except clause is also cut off.  Python 2 (urllib2).
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
)
req.add_header('Referer', 'http://gupiao.jd.com/find/12195')
req.add_header('Host', 'gupiao.jd.com')
req.add_header('Origin', 'http://gupiao.jd.com')
# Session cookie captured from a logged-in browser session.
req.add_header(
    'Cookie',
    'TrackID=1zjctpUkfXiPPpd2-FlJw52fq9gkx9v0WGqH_4sECdaGDpJ8D_58Bqx-Bx4HQsVMYTsT5X4AEec9ZtKVXPzJEMA; pinId=EX7C17pLL2_bXrUjzBWQTQ; __jdv=204210054|direct|-|none|-|1531620946230; _jrda=3; sec_flag=e125e94ccd30d095203da363b24adad3; sec_addr=c0a8006c; wlfstk_smdl=uj4fvqhhhqq66p2ddnrgf4vw8a2cggkb; 3AB9D23F7A4B3C9B=XG5I3N4FBWQZLN7HPAC56MKB755NV4K4D6CA6ICAOGCMBJBKMFJPJFYCRFOUFX7YP4IHFLD3YJJESRXWWTFXSHEVFM; __jda=204210054.1495960752486274042302.NaN.1525092662.1531620946.23; __jdb=204210054.10.1495960752486274042302|23.1531620946; __jdc=204210054; __jdu=1495960752486274042302; _jrdb=1531621024187'
)
req.add_header('Content-Type',
               'application/x-www-form-urlencoded; charset=UTF-8')
response = urllib2.urlopen(req)
# Strip all whitespace and percent signs from the raw response body
# before handing it to the parser.
string = response.read().replace("\n", "").replace("\t", "").replace(
    " ", "").replace("%", "")
# print string
htmlParser = MyHTMLParser()
htmlParser.feed(string)
# Convert the parser object to JSON via its __dict__.
parserDict = htmlParser.__dict__
try:
    # Drop HTMLParser-internal bookkeeping attributes so only the
    # scraped fields remain in the dict.
    parserDict.pop('interesting')
    parserDict.pop('lasttag')
    parserDict.pop('lineno')
    parserDict.pop('offset')
    parserDict.pop('cdata_elem')
    parserDict.pop('rawdata')
    parserDict.pop('_HTMLParser__starttag_text')
    parserDict.pop('index')
    # parserDict['buyNumStart'] = "100.00"
############################################### Fonctions ############################################################ def creatDictAuth(tab): d = dict() for i in range(len(tab.author)): if len(tab.author[i]) >0: if tab.author[i][0] not in d.keys() : d[tab.author[i][0]] = list() for j in range(1,len(tab.author[i])): d[tab.author[i][0]].append(tab.author[i][j]) return d parser = MyHTMLParser() parser = parserFichier(parser, 'dblp.xml') dict_auth = creatDictAuth(parser) # dictionnaire auteurs co_auteurs F = {'author', 'year','journal', 'title', 'co_authors'} def sortTab(tab): d = list() for i in range(len(tab)): d.append([tab[i],i]) d = sorted(d) return d def orderTab(order): if order.lower() == "author":
def image_src(flickr_description):
    """Return the image src extracted from a Flickr description's HTML."""
    html_parser = MyHTMLParser()
    html_parser.feed(flickr_description)
    return html_parser.src
# coding=utf-8 import re import urllib2 from MyHTMLParser import MyHTMLParser url = 'http://ru.dhgate.com/' if __name__ == "__main__": data = urllib2.urlopen(url).read() hp = MyHTMLParser() hp.feed(data) hp.close() for link in hp.links: print link a = urllib2.urlopen(link).getcode() print a,link
def parseProfile(response):
    """Read an HTTP response body and convert it to JSON via MyHTMLParser."""
    body = response.read()
    profile_parser = MyHTMLParser(body)
    return profile_parser.getJson()
def __init__(self, html):
    # Debug marker left by the author; prints on every construction.
    print "1"
    # Delegate to MyHTMLParser's initializer (Python 2 explicit base call).
    # NOTE(review): the stock HTMLParser.__init__ takes no html argument —
    # confirm MyHTMLParser defines this signature.
    MyHTMLParser.__init__(self, html)
class EmailPrser():
    # Python 2 class: walks a directory of raw emails, cleans each one,
    # labels it Spam/Ham from a mapping file, assigns a train/test split,
    # and bulk-indexes everything into Elasticsearch.

    # Class-wide document counter (not used in the methods shown here).
    DocumentCounter = 0
    # Percentage of documents routed to the test split.
    TestingDataInPercent = 25

    ######################### Constructor #######################################
    def __init__(self, emailDirPath, spamMappingFilePath):
        # emailDirPath: directory holding the raw email files.
        # spamMappingFilePath: file mapping each file name to Spam/Ham.
        self._emailDirPath = emailDirPath
        self._spamIdentifierMapFilePath = spamMappingFilePath
        # {'inmail.1' :'Spam', 'inmail.10':'Ham',...........'inmail.200':'Spam'}
        self._emailFileNameToSpamOrHamMap = {}
        self._emailFileNameToSpamOrHamMap = self.__LoadFileNameToSpamOrHamMapping__(
            self._spamIdentifierMapFilePath)
        self._myHtmlParserObj = MyHTMLParser()
        # Scratch file stamped with the run's start time.
        self.fp = "test.txt"
        if os.path.isfile(self.fp):
            os.remove(self.fp)
        with open(self.fp, 'w') as handle:
            handle.write(str(datetime.datetime.now()))
        self.bulkList = []
        self._ESMgrObject = ElasticSearchManager(Resource.INDEX_NAME,
                                                 Resource.TYPE_NAME)

    ##############################################################################
    def __LoadFileNameToSpamOrHamMapping__(self, spamIdentifierMapFilePath):
        # Parse lines of the form "spam ../data/inmail.1" into
        # {file name: label}.
        fileNameToSpamOrHamMapping = {}
        with open(spamIdentifierMapFilePath, 'r') as handle:
            for aLine in handle:
                # aLine -> spam ../data/inmail.1
                SpamOrHam, relativeFilePath = aLine.split(' ')
                fileName = (relativeFilePath.split('/')[-1]).strip('\n')
                fileNameToSpamOrHamMapping[fileName] = SpamOrHam
        return fileNameToSpamOrHamMapping

    def __IndexAllEmailsInDirectory__(self, emailDirPath):
        # Clean + label every email under emailDirPath, then bulk-index.
        print "All emails are present in directory -> ", emailDirPath
        fileCounter = -1
        print '\n'
        bulkList = []
        i = 0
        for root, dirs, files in os.walk(emailDirPath):
            for file in files:
                # To find which document is missing from the email documents directory
                # i = i + int(os.path.basename(
                #     os.path.join(root, file)).split('.')[1])
                # print i
                # continue
                emailFilePath = os.path.join(root, file)
                fileCounter += 1
                fileName = os.path.basename(emailFilePath)
                # if len(fileName) == 11 or len(fileName) == 10:
                LabelSpamOrHam = self.__GetLabelAsSpamOrHam__(fileName)
                SplitTrainOrTest = self.__GetSplitAsTrainOrTest__(fileCounter)
                print "Cleaning...", fileCounter + 1, '/ 75149. ', fileName, LabelSpamOrHam, SplitTrainOrTest
                # emailFilePath = 'C:/Users/vikas/Dropbox/[email protected]/hw7/Input/trec07p/data\inmail.1885'
                emailContent = self.__GetEmailContent__(emailFilePath)
                # print emailContent
                # self.__IndexesEmailDoc__(fileName, emailContent, LabelSpamOrHam, SplitTrainOrTest)
                logicalDocumentForElasticSearch = self.__ConsituteDocument__(
                    fileName, emailContent, LabelSpamOrHam, SplitTrainOrTest)
                bulkList.append(logicalDocumentForElasticSearch)
                # if emailFilePath == 'C:/Users/vikas/Dropbox/[email protected]/hw7/Input/trec07p/data\inmail.1884':
                #     print "220"
                #     print logicalDocumentForElasticSearch
                #     exit()
                # self.__IndexesDocsInBulk__(bulkList)
                # exit()
                sleep(.5)
        print "Indexing all email in bulk..."
        self.__IndexesDocsInBulk__(bulkList)
        # NOTE(review): exit() makes the index-stats report below
        # unreachable — confirm whether this is intentional.
        exit()
        res = self._ESMgrObject.__CurrentIndexStats__()
        print str(res["count"]) + "/ 75149", "documents indexed.\n"

    def __GetEmailContent__(self, filePath):
        # Extract subject + text/plain|text/html body, strip HTML, clean.
        self._myHtmlParserObj = MyHTMLParser()
        emailContent = ""
        with open(filePath, 'r') as handle:
            emailMessage = email.message_from_file(handle)
            emailBody = ""
            if emailMessage.is_multipart():
                # Concatenate every textual part of a multipart message.
                for part in emailMessage.walk():
                    if part.get_content_type(
                    ) == "text/html" or part.get_content_type(
                    ) == "text/plain":
                        partPayload = part.get_payload()
                        emailBody = emailBody + ' ' + partPayload
            else:
                if emailMessage.get_content_type(
                ) == "text/html" or emailMessage.get_content_type(
                ) == "text/plain":
                    emailBody = emailMessage.get_payload()
            # Cleaning email content
            emailSubject = ''
            if emailMessage.has_key('subject'):
                emailSubject = self.__CleanEmailContent__(
                    emailMessage['subject'])
            emailContent = self._myHtmlParserObj.GetParsedContentFromHtml(
                emailBody)
            emailContent = str(emailSubject) + " " + str(emailContent)
            emailContent = self.__CleanEmailContent__(emailContent)
        return emailContent

    def __CleanEmailContent__(self, emailContent):
        # Remove new line char
        emailContent = emailContent.replace('\n', ' ')
        # Remove other than alphabets and numbers
        emailContent = re.sub('[^a-zA-Z0-9\n]', ' ', emailContent)
        # all words in lower case
        emailContent = emailContent.lower()
        # Remove multiple spaces between words
        emailContent = re.sub(' +', ' ', str(emailContent))
        return emailContent

    def __GetLabelAsSpamOrHam__(self, fileName):
        # Look up the Spam/Ham label by email file name.
        return self._emailFileNameToSpamOrHamMap[fileName]

    def __GetSplitAsTrainOrTest__(self, fileCounter):
        # Every (100 / TestingDataInPercent)-th document goes to 'test'.
        everyNthNoForTest = 100 / EmailPrser.TestingDataInPercent
        TrainOrTest = 'train'
        if (fileCounter + 1) % everyNthNoForTest == 0:
            TrainOrTest = 'test'
        return TrainOrTest

    def __IndexesEmailDoc__(self, fileName, emailContent, SpamOrHam,
                            TrainOrTest):
        # Index a single document immediately (non-bulk path).
        self._ESMgrObject.__IndexDoc__(fileName, emailContent, SpamOrHam,
                                       TrainOrTest)

    def __ConsituteDocument__(self, fileName, emailContent, SpamOrHam,
                              TrainOrTest):
        # Build one Elasticsearch bulk-API action for this email.
        action = {
            "_index": Resource.INDEX_NAME,
            '_type': Resource.TYPE_NAME,
            '_id': fileName,
            '_source': {
                "text": emailContent,
                "label": SpamOrHam,
                "split": TrainOrTest,
                "name": fileName
            }
        }
        return action

    def __IndexesDocsInBulk__(self, bulkList):
        # Delegate bulk indexing to the ES manager.
        self._ESMgrObject.__IndexBulkDoc__(bulkList)
import fileinput
import requests
import time
import curses
import sys, math
import pylibs.pycurses_widgets
from MyHTMLParser import MyHTMLParser

# Seconds to wait between requests (politeness delay).
niceInterval = 1
detailparser = MyHTMLParser()
#stdscr = curses.initscr()


def on_pdf(url):
    # Callback invoked by the parser for each PDF link found
    # (Python 2 print statement).
    print url
    sys.stdout.flush()


def loopit():
    # Fetch every detail-page URL given on stdin and feed it to the
    # shared module-level parser.
    for line in fileinput.input():
        #print 'Retrieving link list for detail page...',line
        r = requests.get(line.rstrip())
        detailparser.feed(r.text)
        time.sleep(niceInterval)


# Register the PDF callback, then process all stdin input.
detailparser.set_pdf_handler(on_pdf)
loopit()
#curses.wrapper(loopit)
# MYHTMLParser
from html.parser import HTMLParser  # base class was previously unimported
from MyHTMLParser import MyHTMLParser  # NOTE: shadowed by the class below


class MyHTMLParser(HTMLParser):
    """Demo parser that logs start tags, end tags and text data.

    Fixes over the original: HTMLParser only calls hooks named
    handle_starttag / handle_endtag / handle_data — the original's
    start_tag / end_tag / handel_data were never invoked — and feed()
    must be *called* with the document, not assigned a string.
    """

    def handle_starttag(self, tag, attrs):
        print("encountered a start tag: ", tag)

    def handle_endtag(self, tag):
        print("encountered an end tag : ", tag)

    def handle_data(self, data):
        print("encountered some data is : ", data)


parser = MyHTMLParser()
parser.feed('<html><head><title>test</title></head>'
            "<body><h1>parse me!</h1></body></html>")
def backupPrivateMessages(myTuenti, email, password):
    # Download every private-message thread from Tuenti into ./privados,
    # one file per thread.  Python 2 code; user-facing strings are
    # Spanish and must stay as-is.
    printStarting('mensajes privados')
    print '| Obteniendo identificadores de tus mensajes privados'
    print '| (esto llevara algun tiempo)'
    # First API page reports the total thread count.
    messages = myTuenti.getInbox(0)
    totalMessages = int(messages[0]['num_threads'])
    keys = []
    maxFill = len(str(totalMessages))  # zero-padding width for file names
    # Inbox is paged (10 threads per page); round the page count up.
    iters = totalMessages / 10.0
    if math.fmod(iters, 1) != 0.0:
        iters += 1
    iters = int(iters)
    for i in range(0, iters):
        messages = myTuenti.getInbox(i)
        for message in messages[0]['threads']:
            keys.append(message['key'])
        sleep(0.5)  # politeness delay between API pages
    # Log in to the mobile site to fetch the full thread HTML.
    s = requests.Session()
    r = s.get('https://m.tuenti.com/?m=Login', verify=False)
    csrf = re.findall('name="csrf" value="(.*?)"', r.text)[0]
    data = {
        'csrf': csrf,
        'tuentiemailaddress': email,
        'password': password,
        'remember': 1
    }
    s.post('https://m.tuenti.com/?m=Login&f=process_login', data)
    r = s.get("https://m.tuenti.com/?m=Profile&func=my_profile", verify=False)
    # If the profile page still shows the login form, credentials failed.
    if r.text.find('email') != -1:
        print '| E-mail o password incorrectos'
        raw_input('| Pulsa ENTER para continuar')
        return
    rootPath = os.getcwd()
    theJoinPath = os.path.join(rootPath, 'privados')
    if not os.path.exists(theJoinPath):
        print '| Creando directorio donde se alojaran los mensajes privados...'
        os.makedirs(theJoinPath)
        print '| Directorio creado'
    os.chdir(theJoinPath)
    counter = 0
    parser = MyHTMLParser()
    for key in keys:
        counter += 1
        percent = 100 * counter / totalMessages
        print '| [' + str(percent) + '%] Descargando mensaje ' + \
            str(counter) + ' de ' + str(totalMessages) + '...'
        urlName = 'https://m.tuenti.com/?m=messaging&func=view_thread&thread_id='
        urlName += key + '&box=inbox&view_full=1'
        r = s.get(urlName, verify=False)
        sleep(0.5)
        # The parser writes the thread to a zero-padded numbered file.
        parser.setFile(string.zfill(counter, maxFill))
        parser.feed(r.text)
    os.chdir(rootPath)