Example #1
    def setIndex():
        from MyHTMLParser import MyHTMLParser
        import os

        HTMLlist = []
        nobody = [
            "index.html", "index1.html", "index2.html", "index3.html",
            "index4.html", "ru3.html", "ru2.html", "ru1.html", "rut.html",
            "rus.html", "ru.html"
        ]
        for (parent, d, f) in os.walk(NorroenDyrd.mirror):
            for fn in f:
                if fn in nobody:
                    continue
                elif fn.find(".html") == -1:
                    continue
                elif os.path.join(parent, fn) in HTMLlist:
                    continue
                else:
                    HTMLlist.append(os.path.join(parent, fn))
        html = []
        for h in HTMLlist:
            entry = {}
            with open(h, "r", encoding="utf-8") as f:
                html = f.readlines()
            parser = MyHTMLParser()
            for i in html:
                parser.feed(i)
            entry["path"] = h.replace(NorroenDyrd.mirror, NorroenDyrd.base)
            entry["text"] = parser.plaintext
            entry["title"] = parser.title
            NorroenDyrd.index.append(entry)
            del parser
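Example #1 relies on a MyHTMLParser that exposes plaintext and title attributes once a file has been fed. That class lives in the project's own MyHTMLParser module and is not shown in the snippet; the block below is only a minimal sketch of what such a parser might look like, assuming it subclasses the standard-library html.parser.HTMLParser and simply accumulates character data plus the <title> contents.

# Hedged sketch only: the attribute names (plaintext, title) come from Example #1;
# everything else is an assumption, not the project's actual implementation.
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.plaintext = ""    # all character data seen so far
        self.title = ""        # text inside the <title> element
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        self.plaintext += data
        if self._in_title:
            self.title += data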
Example #2
    def __GetEmailContent__(self, filePath):
        self._myHtmlParserObj = MyHTMLParser()
        emailContent = ""
        with open(filePath, 'r') as handle:
            emailMessage = email.message_from_file(handle)

            emailBody = ""
            if emailMessage.is_multipart():
                for part in emailMessage.walk():

                    if part.get_content_type() in ("text/html", "text/plain"):
                        partPayload = part.get_payload()
                        emailBody = emailBody + ' ' + partPayload
            else:
                if emailMessage.get_content_type() in ("text/html", "text/plain"):
                    emailBody = emailMessage.get_payload()

            # Cleaning email content
            emailSubject = ''
            if emailMessage.has_key('subject'):
                emailSubject = self.__CleanEmailContent__(
                    emailMessage['subject'])

            emailContent = self._myHtmlParserObj.GetParsedContentFromHtml(
                emailBody)

            emailContent = str(emailSubject) + " " + str(emailContent)
            emailContent = self.__CleanEmailContent__(emailContent)

            return emailContent
Example #3
class Inverter():
    #global doc_id, term_count

    def __init__(self, config):
        self.config = config
        self.ht = myhashtable(config)
        self.htmlparser = MyHTMLParser(self.config, self.ht)
        self.start_batch_processing()
        self.write_file_map()
        self.ht.write_posting_file(term_count)
        self.ht.write_hash_table()

    def start_batch_processing(self):
        file_id = 0

        for in_file in os.listdir(self.config['str_src_dir']):
            #if in_file not in ['medium.html','simple.html']: continue #for testing
            with open(self.config['str_src_dir'] + in_file, 'r') as f:
                doc_id[file_id] = in_file
                term_count[file_id] = 0
                self.htmlparser.feed(f.read(), file_id)
                file_id += 1

    # writing doc_id <--> doc_name file
    def write_file_map(self):
        #writing document id file
        with open(
                self.config['str_dst_dir'] +
                self.config['str_doc_id_file_name'], 'wb+') as f:
            for did, txt in doc_id.iteritems():
                f.write('{0:0>{1}d} {2:'
                        '<{3}s}\n'.format(did,
                                          self.config['file_id_encoding_len'],
                                          txt, self.config['file_name_len']))
Example #4
def f(idx, q,r):
    path = "data%s"%(idx)
    os.makedirs(path)
    while True:
        item = q.get()
        if item.item_type == ITEM_QUIT:
            break

        count = 0
        localQueue = Queue()
        current = item.data
        while True:
            print current
            fo = urlopen(current)
            data = fo.read()
            name = "%s/%s"%(path,count)
            fw = open( name, "w" )
            count = count + 1
            fw.write(data)
            fw.close()
            fo.close()
            p = MyHTMLParser()
            try:
                p.feed(data)
            except:
                pass

            for href in p.hrefs:
                print item.data, ": ", href

            try:
                current = localQueue.get_nowait()
            except:
                break
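Example #4 iterates p.hrefs, and later snippets depend on similar collections (parser.urls in the crawler, hp.links in Example #22), so each project ships a MyHTMLParser that gathers link targets while parsing. A minimal Python 2 sketch of that pattern, assuming only that the attribute is a plain list of href values:

# Hedged sketch, not the actual class from any of these projects.
from HTMLParser import HTMLParser  # Python 2 module name, matching the snippet above


class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)  # old-style base class, so no super()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        # collect the href attribute of every anchor tag
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.hrefs.append(value)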
Example #5
 def __init__(self, config):
     self.config = config
     self.ht = myhashtable(config)
     self.htmlparser = MyHTMLParser(self.config, self.ht)
     self.start_batch_processing()
     self.write_file_map()
     self.ht.write_posting_file(term_count)
     self.ht.write_hash_table()
Example #6
def create_journey_instructions(steps):
    parser = MyHTMLParser()  # HTML parser for directions API data
    instruct = ""
    for step in steps:
        parser.feed(step['html_instructions'])
        instruct += parser.get_data() + ">>>>>"
    print(instruct)
    return instruct
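Example #6 calls parser.get_data() after each feed to turn the Directions API's html_instructions into plain text. get_data() is not part of the standard HTMLParser interface, so the following is only a rough sketch of what such a helper might look like, with the buffer-clearing behaviour assumed from how the same parser is reused across steps:

# Hypothetical helper; the real MyHTMLParser in that project may differ.
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        self._chunks.append(data)

    def get_data(self):
        # return the text accumulated since the last call, then reset the buffer
        text = "".join(self._chunks)
        self._chunks = []
        return text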
Example #7
def table_maker(pd_row):
    #read data from html-like file
    h = MyHTMLParser()
    h.feed(pd_row['data'])
    soup = soupparser()
    p = soup.handle_data(pd_row)
    dividendpershare = [h.type_dividendpershare, h.asofdate_dividendpershare, h.reporttype_dividendpershare,
                        h.period_dividendpershare, h.currency_dividendpershare, p.data_dividendpershare,
                        h.exdate_dividendpershare, h.recorddate_dividendpershare, h.paydate_dividendpershare,
                        h.declarationdate_dividendpershare]
    totalrevenue = [h.type_totalrevenue, h.asofdate_totalrevenue, h.reporttype_totalrevenue,
                    h.period_totalrevenue, h.currency_totalrevenue, p.data_totalrevenue,
                    h.exdate_totalrevenue, h.recorddate_totalrevenue, h.paydate_totalrevenue,
                    h.declarationdate_totalrevenue]
    dividend = [h.type_dividend, h.asofdate_dividend, h.reporttype_dividend,
                h.period_dividend, h.currency_dividend, p.data_dividend,
                h.exdate_dividend, h.recorddate_dividend, h.paydate_dividend,
                h.declarationdate_dividend]
    eps = [h.type_eps, h.asofdate_eps, h.reporttype_eps,
           h.period_eps, h.currency_eps, p.data_eps,
           h.exdate_eps, h.recorddate_eps, h.paydate_eps, h.declarationdate_eps]

    #sort data and make it into a dataframe
    names = ['type', 'asofdate', 'reporttype', 'period', 'currency','data',
             'exdate', 'recorddate', 'paydate', 'declarationdate']
    def make_dataframe(list1):
        dict1 = {names[i]: list1[i] for i in range(10)}
        dataframe1 = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in dict1.items()]))
        dataframe1 = dataframe1.fillna(method='ffill')
        return dataframe1

    dividendpershare_dataframe = make_dataframe(dividendpershare)
    totalrevenue_dataframe = make_dataframe(totalrevenue)
    dividend_dataframe = make_dataframe(dividend)
    eps_dataframe = make_dataframe(eps)

    table1 = pd.concat([dividendpershare_dataframe, totalrevenue_dataframe,
                        dividend_dataframe, eps_dataframe], axis = 0, ignore_index=True)

    reqId1 = [pd_row['reqId']] * len(table1['type'])
    table1['reqId'] = pd.Series(np.array(reqId1), index = table1.index)

    #format each column to put into sql
    table1['type'] = table1['type'].astype(str)
    table1['reporttype'] = table1['reporttype'].astype(str)
    table1['period'] = table1['period'].astype(str)
    table1['asofdate'] = pd.to_datetime(table1['asofdate'])
    table1['exdate'] = pd.to_datetime(table1['exdate'])
    table1['recorddate'] = pd.to_datetime(table1['recorddate'])
    table1['paydate'] = pd.to_datetime(table1['paydate'])
    table1['declarationdate'] = pd.to_datetime(table1['declarationdate'])

    #drop_duplicate line
    table1 = table1.drop_duplicates()

    return table1
Example #8
File: Facade.py Project: hrkg/pycurl
    def downloadPictures():

        curl = MyCurl()
        curl.set_url(MyUriEncode.getUrl())
        
        buffer = BytesIO()
        buffer = curl.set_buffer(buffer)
        
        curl.exec()
        curl.close()
        
        body = buffer.getvalue().decode('utf-8')
        
        parser = MyHTMLParser() 
        parser.feed(body)
Example #9
def getListOfPokemonPages():

    pokemonListAddress = "http://bulbapedia.bulbagarden.net/wiki/List_of_Pok%C3%A9mon_by_National_Pok%C3%A9dex_number"

    pokeListResponse = urllib.request.urlopen(pokemonListAddress)
    pokeListPage = str(pokeListResponse.read())

    parser = MyHTMLParser()
    parser.feed(pokeListPage)

    baseBulbapediaAdress = "http://bulbapedia.bulbagarden.net"

    for i, link in enumerate(parser.pokeListParser.pokemonURLs):
        parser.pokeListParser.pokemonURLs[i] = baseBulbapediaAdress + link

    return  parser.pokeListParser.pokemonURLs
Example #10
    def dataFetcher(self, jobname=None):
        global masterList
        global masterDictionary
        global jobName
        global prePost
        # METHOD OVERLOADING
        if jobname is None:
            mydatafetcher = URLCreator(jobname=self.jobName)
        else:
            mydatafetcher = URLCreator(jobname=jobname)

        contents = mydatafetcher.loadUrl()
        parser = MyHTMLParser()
        root = parser.feed(contents)
        postdependencyList = list(set(parser.postList))
        predependencyList = list(set(parser.preList))
        # print('pre:',predependencyList)
        return predependencyList, postdependencyList
Example #11
    def __init__(self, emailDirPath, spamMappingFilePath):
        self._emailDirPath = emailDirPath
        self._spamIdentifierMapFilePath = spamMappingFilePath

        # {'inmail.1' :'Spam', 'inmail.10':'Ham',...........'inmail.200':'Spam'}
        self._emailFileNameToSpamOrHamMap = {}
        self._emailFileNameToSpamOrHamMap = self.__LoadFileNameToSpamOrHamMapping__(
            self._spamIdentifierMapFilePath)

        self._myHtmlParserObj = MyHTMLParser()
        self.fp = "test.txt"

        if os.path.isfile(self.fp):
            os.remove(self.fp)
        with open(self.fp, 'w') as handle:
            handle.write(str(datetime.datetime.now()))
        self.bulkList = []

        self._ESMgrObject = ElasticSearchManager(Resource.INDEX_NAME,
                                                 Resource.TYPE_NAME)
Example #12
    def crawl(self, depth, frontier):
        if depth > self.maxdepth:
            return

        nextLevelFrontier = list()
        for url in frontier:
            # only parse when the number of crawled pages does not exceed the maximum
            if len(self.crawledlist) < self.numPages and url not in self.crawledlist:
                # pass in the URL and create the request
                request = req.Request(
                    url,
                    headers={
                        "User-Agent":
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
                    })
                try:
                    time.sleep(1)
                    # send the request to the url and get the response
                    data = req.urlopen(request).read().decode("utf-8")
                    parser = MyHTMLParser()
                    parser.feed(data)
                    # Handling Nonetype
                    if self.record(url, depth):
                        self.create_web_file(data, len(self.crawledlist))
                        print(url)
                        print("Finished:", len(self.crawledlist), "files")
                        print("current depth: ", depth)
                        nextLevelFrontier += parser.urls
                # try to catch errors when encounter
                except urllib.error.HTTPError as err:
                    # handling page not found error
                    if err.code == 404:
                        continue
                    else:
                        raise
        self.crawl(depth + 1, nextLevelFrontier)
Example #13
def getImageTagFromHtml(file):

	parser = MyHTMLParser()
	parser.feed(file)

	return parser.map
Example #14
# grab links from html stdin input and canonicalize them
# spit them out on newlines for processing

from MyHTMLParser import MyHTMLParser
import fileinput
import sys
import argparse

argparser = argparse.ArgumentParser(description='Parse an html dump of an IA page for matching links')
#argparser.add_argument('--path', dest='path',
#                   default='/',
#                   help='path to match in links')

args = argparser.parse_args()

parser = MyHTMLParser()
html = sys.stdin.read()
parser.feed(html)
for link in parser.get_details():
	print link
Example #15
			fname,fext=os.path.splitext(file)

			if any(fext in s for s in filetype):
				filelist.append(os.path.join(rootpath,file))

	return filelist



def getImageTagFromHtml(file):

	parser = MyHTMLParser()
	parser.feed(file)

	return parser.map


def getImageAltTextByImageName(image):
	return

if __name__=="__main__":


	files=getAllHtmlFiles(rootpath,filetype)
	for file in files:
		fileobject=open(file,'r')
		data=fileobject.read()
		htmlParse=MyHTMLParser()
		htmlParse.feed(data)


Example #16
                path = srcList.pop()
                if not "__jpeg" in path:
                
                    v = HttpClient()
                    v.DownloadFile(path,"/Users/zhouxq/images/"+pathdir)
                    print path

if __name__ == "__main__":
    urlStr = raw_input("url:")
    urlList = urlStr.split("/")
    pathdir = urlList[len(urlList)-1]
    v = HttpClient()
    value = v.Get(urlStr,urlStr)
    #r1 = re.compile(r"http://\S*\.jpe*g")
    from MyHTMLParser import MyHTMLParser
    parser = MyHTMLParser()
    value = value.decode('gbk').encode('utf-8')
    print value
    
   
    parser.feed(value,"input")
    nodes = parser.get_nodes()
    print nodes
    srcList = []
    for node in nodes:
        for attr in node["attrs"]:
            if attr == "src":
                srcList.append(node["attrs"][attr])
                #print node["attrs"][attr]

Example #17
 def parseItem(self, htmlText):
     parser = MyHTMLParser(htmlText, self.requestHandler)
     return parser.getJson()
Example #18
'''
Current driver for a simple HTML processor,
which builds a DOM tree.
'''
from MyHTMLParser import MyHTMLParser

mockData = "<html><head><title>This is the</title></head><body><h1>This<br /><>is the <span>header</span></h1></body></html>"

cursor = MyHTMLParser()
cursor.feed(mockData) #builds tree

cursor.printTree()	#prints tree!	

def findElementsByClass(className):
	return cursor.findElementsByClass(className)
Example #19
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    )
    req.add_header('Referer', 'http://gupiao.jd.com/find/12195')
    req.add_header('Host', 'gupiao.jd.com')
    req.add_header('Origin', 'http://gupiao.jd.com')
    req.add_header(
        'Cookie',
        'TrackID=1zjctpUkfXiPPpd2-FlJw52fq9gkx9v0WGqH_4sECdaGDpJ8D_58Bqx-Bx4HQsVMYTsT5X4AEec9ZtKVXPzJEMA; pinId=EX7C17pLL2_bXrUjzBWQTQ; __jdv=204210054|direct|-|none|-|1531620946230; _jrda=3; sec_flag=e125e94ccd30d095203da363b24adad3; sec_addr=c0a8006c; wlfstk_smdl=uj4fvqhhhqq66p2ddnrgf4vw8a2cggkb; 3AB9D23F7A4B3C9B=XG5I3N4FBWQZLN7HPAC56MKB755NV4K4D6CA6ICAOGCMBJBKMFJPJFYCRFOUFX7YP4IHFLD3YJJESRXWWTFXSHEVFM; __jda=204210054.1495960752486274042302.NaN.1525092662.1531620946.23; __jdb=204210054.10.1495960752486274042302|23.1531620946; __jdc=204210054; __jdu=1495960752486274042302; _jrdb=1531621024187'
    )
    req.add_header('Content-Type',
                   'application/x-www-form-urlencoded; charset=UTF-8')
    response = urllib2.urlopen(req)
    string = response.read().replace("\n", "").replace("\t", "").replace(
        " ", "").replace("%", "")
    # print string
    htmlParser = MyHTMLParser()
    htmlParser.feed(string)

    # Convert the parser object to JSON
    parserDict = htmlParser.__dict__

    try:
        parserDict.pop('interesting')
        parserDict.pop('lasttag')
        parserDict.pop('lineno')
        parserDict.pop('offset')
        parserDict.pop('cdata_elem')
        parserDict.pop('rawdata')
        parserDict.pop('_HTMLParser__starttag_text')
        parserDict.pop('index')
        # parserDict['buyNumStart'] = "100.00"
Example #20
############################################### Functions ############################################################

def creatDictAuth(tab):
    d = dict()
    for i in range(len(tab.author)):
        if len(tab.author[i]) > 0:
            if tab.author[i][0] not in d.keys():
                d[tab.author[i][0]] = list()
            for j in range(1, len(tab.author[i])):
                d[tab.author[i][0]].append(tab.author[i][j])
    return d



parser = MyHTMLParser()
parser = parserFichier(parser, 'dblp.xml')	
dict_auth = creatDictAuth(parser)    # dictionary mapping authors to co-authors
F = {'author', 'year','journal', 'title', 'co_authors'}


def sortTab(tab):
    d = list()
    for i in range(len(tab)):
        d.append([tab[i], i])
    d = sorted(d)
    return d

def orderTab(order):
        if order.lower() == "author":
Example #21
def image_src(flickr_description):
    p = MyHTMLParser()
    p.feed(flickr_description)
    return p.src
Example #22
File: 1pop.py Project: hockbase/backup
# coding=utf-8

import re
import urllib2
from MyHTMLParser import MyHTMLParser

url = 'http://ru.dhgate.com/'

if __name__ == "__main__":
    data = urllib2.urlopen(url).read()
    hp = MyHTMLParser()
    hp.feed(data)
    hp.close()
    for link in hp.links:
        print link
        a = urllib2.urlopen(link).getcode()
        print a,link

Example #23
def parseProfile(response):
    htmlText = response.read()
    parser = MyHTMLParser(htmlText)
    return parser.getJson()
Example #24
 def __init__(self, html):
     print "1"
     MyHTMLParser.__init__(self, html)
Example #25
class EmailPrser():
    DocumentCounter = 0
    TestingDataInPercent = 25

    #########################   Constructor #######################################
    def __init__(self, emailDirPath, spamMappingFilePath):
        self._emailDirPath = emailDirPath
        self._spamIdentifierMapFilePath = spamMappingFilePath

        # {'inmail.1' :'Spam', 'inmail.10':'Ham',...........'inmail.200':'Spam'}
        self._emailFileNameToSpamOrHamMap = {}
        self._emailFileNameToSpamOrHamMap = self.__LoadFileNameToSpamOrHamMapping__(
            self._spamIdentifierMapFilePath)

        self._myHtmlParserObj = MyHTMLParser()
        self.fp = "test.txt"

        if os.path.isfile(self.fp):
            os.remove(self.fp)
        with open(self.fp, 'w') as handle:
            handle.write(str(datetime.datetime.now()))
        self.bulkList = []

        self._ESMgrObject = ElasticSearchManager(Resource.INDEX_NAME,
                                                 Resource.TYPE_NAME)

    ##############################################################################
    def __LoadFileNameToSpamOrHamMapping__(self, spamIdentifierMapFilePath):
        fileNameToSpamOrHamMapping = {}
        with open(spamIdentifierMapFilePath, 'r') as handle:
            for aLine in handle:
                # aLine -> spam ../data/inmail.1
                SpamOrHam, relativeFilePath = aLine.split(' ')
                fileName = (relativeFilePath.split('/')[-1]).strip('\n')
                fileNameToSpamOrHamMapping[fileName] = SpamOrHam

        return fileNameToSpamOrHamMapping

    def __IndexAllEmailsInDirectory__(self, emailDirPath):
        print "All emails are present in directory -> ", emailDirPath
        fileCounter = -1
        print '\n'
        bulkList = []
        i = 0
        for root, dirs, files in os.walk(emailDirPath):
            for file in files:
                # To find which document is missing from the email documents directory
                # i = i +   int(os.path.basename( os.path.join(root, file)).split('.')[1])
                # print i
                # continue

                emailFilePath = os.path.join(root, file)
                fileCounter += 1

                fileName = os.path.basename(emailFilePath)

                # if len(fileName) == 11 or len(fileName) == 10:
                LabelSpamOrHam = self.__GetLabelAsSpamOrHam__(fileName)
                SplitTrainOrTest = self.__GetSplitAsTrainOrTest__(fileCounter)

                print "Cleaning...", fileCounter + 1, '/ 75149. ', fileName, LabelSpamOrHam, SplitTrainOrTest
                # emailFilePath = 'C:/Users/vikas/Dropbox/[email protected]/hw7/Input/trec07p/data\inmail.1885'
                emailContent = self.__GetEmailContent__(emailFilePath)
                # print emailContent

                # self.__IndexesEmailDoc__(fileName, emailContent, LabelSpamOrHam, SplitTrainOrTest)
                logicalDocumentForElasticSearch = self.__ConsituteDocument__(
                    fileName, emailContent, LabelSpamOrHam, SplitTrainOrTest)
                bulkList.append(logicalDocumentForElasticSearch)

                # if emailFilePath == 'C:/Users/vikas/Dropbox/[email protected]/hw7/Input/trec07p/data\inmail.1884':
                #     print "220"
                #     print logicalDocumentForElasticSearch
                #     exit()

            #     self.__IndexesDocsInBulk__(bulkList)
            #     exit()

        sleep(.5)
        print "Indexing all email in bulk..."
        self.__IndexesDocsInBulk__(bulkList)
        exit()

        res = self._ESMgrObject.__CurrentIndexStats__()
        print str(res["count"]) + "/ 75149", "documents indexed.\n"

    def __GetEmailContent__(self, filePath):
        self._myHtmlParserObj = MyHTMLParser()
        emailContent = ""
        with open(filePath, 'r') as handle:
            emailMessage = email.message_from_file(handle)

            emailBody = ""
            if emailMessage.is_multipart():
                for part in emailMessage.walk():

                    if part.get_content_type() in ("text/html", "text/plain"):
                        partPayload = part.get_payload()
                        emailBody = emailBody + ' ' + partPayload
            else:
                if emailMessage.get_content_type() in ("text/html", "text/plain"):
                    emailBody = emailMessage.get_payload()

            # Cleaning email content
            emailSubject = ''
            if emailMessage.has_key('subject'):
                emailSubject = self.__CleanEmailContent__(
                    emailMessage['subject'])

            emailContent = self._myHtmlParserObj.GetParsedContentFromHtml(
                emailBody)

            emailContent = str(emailSubject) + " " + str(emailContent)
            emailContent = self.__CleanEmailContent__(emailContent)

            return emailContent

    def __CleanEmailContent__(self, emailContent):
        #  Remove new line char
        emailContent = emailContent.replace('\n', ' ')
        #  Remove other than alphabets and numbers
        emailContent = re.sub('[^a-zA-Z0-9\n]', ' ', emailContent)
        # all words in lower case
        emailContent = emailContent.lower()
        # Remove multiple spaces between words
        emailContent = re.sub(' +', ' ', str(emailContent))
        return emailContent

    def __GetLabelAsSpamOrHam__(self, fileName):
        return self._emailFileNameToSpamOrHamMap[fileName]

    def __GetSplitAsTrainOrTest__(self, fileCounter):
        everyNthNoForTest = 100 / EmailPrser.TestingDataInPercent
        TrainOrTest = 'train'
        if (fileCounter + 1) % everyNthNoForTest == 0:
            TrainOrTest = 'test'

        return TrainOrTest

    def __IndexesEmailDoc__(self, fileName, emailContent, SpamOrHam,
                            TrainOrTest):
        self._ESMgrObject.__IndexDoc__(fileName, emailContent, SpamOrHam,
                                       TrainOrTest)

    def __ConsituteDocument__(self, fileName, emailContent, SpamOrHam,
                              TrainOrTest):
        action = {
            "_index": Resource.INDEX_NAME,
            '_type': Resource.TYPE_NAME,
            '_id': fileName,
            '_source': {
                "text": emailContent,
                "label": SpamOrHam,
                "split": TrainOrTest,
                "name": fileName
            }
        }
        return action

    def __IndexesDocsInBulk__(self, bulkList):
        self._ESMgrObject.__IndexBulkDoc__(bulkList)
Example #26
import fileinput
import requests
import time
import curses
import sys, math
import pylibs.pycurses_widgets

from MyHTMLParser import MyHTMLParser

niceInterval = 1

detailparser = MyHTMLParser()

#stdscr = curses.initscr()

def on_pdf(url):
    print url
    sys.stdout.flush()

def loopit():

    for line in fileinput.input():
        #print 'Retrieving link list for detail page...',line
        r = requests.get(line.rstrip())
        detailparser.feed(r.text)
        time.sleep(niceInterval)

detailparser.set_pdf_handler(on_pdf)
loopit()
#curses.wrapper(loopit)
Example #27
# MYHTMLParser
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print("encountered a start tag: ", tag)

    def handle_endtag(self, tag):
        print("encountered an end tag : ", tag)

    def handle_data(self, data):
        print("encountered some data is : ", data)


parser = MyHTMLParser()

parser.feed('<html><head><title>test</title></head>'
            '<body><h1>parse me!</h1></body></html>')
Example #28
File: 20up.py Project: CodingFree/20up
def backupPrivateMessages(myTuenti, email, password):
    printStarting('mensajes privados')
    
    print '| Obteniendo identificadores de tus mensajes privados'
    print '| (esto llevara algun tiempo)'
    messages = myTuenti.getInbox(0)
    totalMessages = int(messages[0]['num_threads'])
    keys = []
    
    maxFill = len(str(totalMessages))
    iters = totalMessages / 10.0
    if math.fmod(iters, 1) != 0.0:
        iters += 1
    iters = int(iters)
    
    for i in range(0, iters):
        messages = myTuenti.getInbox(i)
        for message in messages[0]['threads']:
            keys.append(message['key'])
        
        sleep(0.5)
    
    s = requests.Session()
    r = s.get('https://m.tuenti.com/?m=Login', verify=False)
    csrf = re.findall('name="csrf" value="(.*?)"', r.text)[0]

    data = { 'csrf': csrf, 'tuentiemailaddress': email, 'password': password, 'remember': 1 }
    s.post('https://m.tuenti.com/?m=Login&f=process_login', data)
    
    r = s.get("https://m.tuenti.com/?m=Profile&func=my_profile", verify=False)
    if r.text.find('email') != -1:
        print '| E-mail o password incorrectos'
        raw_input('| Pulsa ENTER para continuar')
        return
    
    rootPath = os.getcwd()
    theJoinPath = os.path.join(rootPath, 'privados')
    if not os.path.exists(theJoinPath):
        print '| Creando directorio donde se alojaran los mensajes privados...'
        os.makedirs(theJoinPath)
        print '| Directorio creado'
    os.chdir(theJoinPath)
    
    counter = 0
    parser = MyHTMLParser()
    for key in keys:
        counter += 1
        percent = 100 * counter / totalMessages
        print '| [' + str(percent) + '%] Descargando mensaje ' + \
              str(counter) + ' de ' + str(totalMessages) + '...'
        urlName = 'https://m.tuenti.com/?m=messaging&func=view_thread&thread_id='
        urlName += key + '&box=inbox&view_full=1'
        
        r = s.get(urlName, verify=False)
        
        sleep(0.5)

        parser.setFile(string.zfill(counter, maxFill))
        parser.feed(r.text)
        
    os.chdir(rootPath)