Example #1
def load(id, tm, url, html, encode, xpaths):
    # Parse the page, pull one value per configured XPath, and dump the
    # result as a JSON object to ./data/<id>.dat.
    parser = HtmlParser(html, encode)
    parser.parse()

    # Note: db_sql and jd are built but never used in this excerpt.
    db_sql = "insert into job_detail(url,src_desc,type,title,\
    keywords,department,job_require,job_duty,\
    job_welfare,label,company,company_desc,\
    logo,salary,work_experience,\
    edu, field,location,head_count,pub_time) values("

    jd = page_pb2.JobDescription()
    js = "{\"pub_tm\":\"" + tm + "\","
    js = js + "\"url\":\"" + url + "\","
    for key in xpaths:
#        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
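
For context, a minimal sketch of how this load might be called; the HTML file name, the XPath expressions, and all argument values below are placeholders, not values from the original project:

# Hypothetical invocation; none of these values come from the original project.
xpaths = {
    "title":    "//h1[@class='job-title']/text()",
    "salary":   "//span[@class='salary']/text()",
    "location": "//span[@class='location']/text()",
}
html = open("job_page.html").read()
load("12345", "2020-01-01 00:00:00",
     "http://example.com/job/12345", html, "utf-8", xpaths)
# Writes ./data/12345.dat containing a single JSON object.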
Example #2
def load(html, encode, xpaths):
    # Stripped-down variant of Example #1: extracts each value but does not
    # store it anywhere. elements[0] raises IndexError if an XPath matches
    # nothing, since there is no length guard here.
    parser = HtmlParser(html, encode)
    parser.parse()
    for key in xpaths:
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        value = elements[0][2].encode('utf-8')
Example #3
    def _toString(self):
        htmlParser = HtmlParser('https://www.worldometers.info/coronavirus/')
        htmlParser.parse()

        timeStr = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
        # getContent() yields [infected, deaths, recovered]; call it once.
        stats = htmlParser.getContent()
        text = ("Infection statistics as of " + timeStr +
                "\nInfected: " + stats[0] +
                "\nDeaths: " + stats[1] +
                "\nRecovered: " + stats[2])

        return text
Example #4
def load(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    jd = page_pb2.JobDescription()
    js = "{"
    for key in xpaths:
#        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
Example #5
def crawl(init_url):
    # Follow "next page" links until the parser stops returning one.
    url_pool = UrlManager()  # note: instantiated but unused in this excerpt
    downloader = Downloader()
    parser = HtmlParser()
    outputer = Outputer()
    temp_url = init_url
    while temp_url:
        driver = downloader.download(temp_url)
        content, temp_url = parser.parse(driver)
        outputer.write(content)
    outputer.close()
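
The collaborator classes are not part of the snippet; below is a minimal sketch of the contract the loop relies on, inferred only from the calls above (the bodies are placeholders, not the original implementations, and the unused UrlManager is omitted):

class Downloader(object):
    def download(self, url):
        # placeholder: fetch the page (urllib2, requests, Selenium, ...)
        # and return whatever object HtmlParser.parse() accepts
        raise NotImplementedError

class Outputer(object):
    # collects parsed content; write() is called once per page
    def __init__(self, path="output.txt"):
        self.fp = open(path, "w")

    def write(self, content):
        self.fp.write(content + "\n")

    def close(self):
        self.fp.close()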
Example #6
    def parse_feed(self, feed):
        'Extract list of articles from the feed.'
        articles = []
        htmlparser = HtmlParser()
        for e in feed.entries[:1]:  # read just the first entry while debugging
            article = Article(source=e.author, title=e.title, link=e.link)
            content = htmlparser.parse(e.link)
            article.content = re.sub(r' -.*$', '', content)
            article.save()  # and associated word frequencies
            articles.append(article)
        return articles
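
The final re.sub strips a trailing source attribution of the form " - Reuters" from the article body; a quick illustration with a made-up string:

import re

content = "Markets rallied today - Reuters"
print re.sub(r' -.*$', '', content)  # -> "Markets rallied today"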
Example #7
def load(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    jd = page_pb2.JobDescription()
    js = "{"
    for key in xpaths:
#        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
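
Examples #1, #4, and #7 build the JSON string by hand, which yields invalid output whenever an extracted value contains a quote or backslash. A safer sketch of the same loop using the standard json module; load_json is a hypothetical name, and the HtmlParser calls are assumed to behave as in the originals:

import json

def load_json(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    record = {}
    for key, xpath in xpaths.items():
        elements = parser.get_element_by_xpath(xpath, encode)
        if not elements:
            continue  # skip missing elements, as the originals do
        record[key] = elements[0][2]
    with open("./data/" + id + ".dat", "w") as fp:
        json.dump(record, fp)  # escapes quotes and backslashes correctly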
Example #8
    def diff_html_from_file(cls, fileName1, fileName2, encode):
        '''Get the differing elements between two HTML files.
        '''

        if fileName1 == "" or fileName2 == "":
            print "differ.diff_html_from_file(): fileName1 or fileName2 is empty"
            return []

        html_str1 = open(fileName1, "rb").read()
        html_Parser1 = HtmlParser(html_str1, encode)
        elements1 = html_Parser1.parse()
        html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")

        html_str2 = open(fileName2, "rb").read()
        html_Parser2 = HtmlParser(html_str2, encode)
        elements2 = html_Parser2.parse()
        html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")

        diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
        return diffs
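
A brief usage sketch; the enclosing class is not shown in the excerpt, so the name Differ is a guess, and the file names are placeholders:

diffs = Differ.diff_html_from_file("page_old.html", "page_new.html", "utf-8")
for d in diffs:
    print d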
Example #9
# Imports inferred from usage in this excerpt (Python 2);
# HtmlParser and Article are project-local and not shown here.
import csv
import re

import feedparser
from urllib2 import urlopen, URLError


class Harvester(object):
    
    def __init__(self, db, filename):
        'Harvest articles from the list of feeds in filename.'
        self.db = db
        self.filename = filename
        self.htmlparser = HtmlParser()
        feedlist = self.read_feed_list(filename)
        self.articles = self.parse_feedlist(feedlist)

    def read_feed_list(self, filename):
        '''
        Read the feed list from a CSV file. The first item of each line
        is the URL to an RSS feed.
        '''
        feedlist = []
        reader = csv.reader(open(filename, 'rb'))
        for line in reader:
            feedlist.append(line)
        return feedlist

    def parse_feed(self, entry):
        'Extract list of articles from the feed.'
        articles = []
        (url, publisher, publisher_location) = entry
        try:
            c = urlopen(url)
        except URLError:
            print 'Failed to fetch ' + url
            return articles  # without this, c is undefined below
        feed = feedparser.parse(c)
        # for e in feed.entries[:1]: # read just the first entry while debugging
        for e in feed.entries:
            image_link = None
            image_type = None
            for link in e.links:
                if link['rel'] == 'enclosure':
                    image_link = link['href']
                    image_type = link['type']
            article = Article(
                publisher=publisher,
                publisher_location=publisher_location,
                published_date=e.updated_parsed,
                title=e.title,
                link=e.link,
                image_link=image_link,
                image_type=image_type)
            content = self.htmlparser.parse(e.link)
            m = re.search(r'-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', content)
            if m:
                article.source = m.group(1)
            article.content = re.sub(r'(\\n)?\s*-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', '', content)
            article.store(self.db) # put article and word frequencies into couchdb
            articles.append(article)
        return articles

    def parse_feedlist(self, feedlist):
        'Parse the RSS feeds.'
        articles = []
        for entry in feedlist:
            articles += self.parse_feed(entry)
        return articles

    def __str__(self):
        return self.filename
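
Finally, a hedged sketch of driving Harvester. The comment on article.store(self.db) says articles go into CouchDB, but the python-couchdb client, the database name, and the CSV layout below are assumptions beyond the excerpt:

import couchdb  # assumption: the db handle is a python-couchdb Database

db = couchdb.Server()['articles']       # hypothetical database name
harvester = Harvester(db, 'feeds.csv')  # rows: rss-url,publisher,location
print len(harvester.articles)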