예제 #1
0
def _fetch_and_log(url, filename, index):
    """Fetch `url`, parse its "{key:value}" payload, and append one
    "index, value" sample line to `filename`."""
    with open(filename, "a") as out:
        page = urllib.urlopen(url)
        try:
            html_string = page.read()
        finally:
            page.close()
        # Payload looks like "{<name>:<price>}" -- keep the numeric price.
        parts = html2text(html_string).strip("{}").split(":")
        parts[0] = index
        parts[1] = float(parts[1])
        out.write("\n%s" % str(parts).strip("[]"))


def updateinfo():
    """Poll the BTC/USD and EOS/BTC endpoints once per second, 25 times,
    appending each numbered price sample to its own log file.

    The original body duplicated the fetch/parse/append logic verbatim
    for both pairs and never closed the URL handle; both fixed here.
    """
    x = 0
    while x < 25:
        sleep(1)
        _fetch_and_log(URL_BTC_USD, "fileBtcUsd.txt", 1 + x)
        _fetch_and_log(URL_EOS_BTC, "fileEosBtc.txt", 1 + x)
        x += 1
        print(x)
예제 #2
0
 def number_words(self, page):
     """Count word tokens in `page` after stripping its HTML markup."""
     try:
         stripped = html2text(page.encode('utf8', 'ignore'))
     except UnicodeDecodeError:
         # Byte round-trip failed -- strip the page as-is instead.
         stripped = html2text(page)
     return len(re.findall(r'\w+', stripped))
예제 #3
0
    def _parse_body(self, msg):
        """Extract the body from an email message and store it on self.body.

        text/html payloads are decoded, converted to unicode, and run
        through html2text; every other content type is decoded as-is.
        """
        content_type = msg.get_content_type()
        self.log.debug('Content-Type:' + content_type)

        if content_type == 'text/html':
            body = msg.get_payload(decode=True)
            charset = msg.get_content_charset()

            # TODO: wrap in try/except -- _to_unicode may fail on bogus charsets.
            if charset is not None:
                self.log.debug("charset:" + charset)
                body = self._to_unicode(body, charset)

            body = unicode(body)

            # Local import kept from the original; html2safehtml was unused.
            from stripogram import html2text
            body = html2text(body)

        else:
            body = msg.get_payload(decode=True)
            charset = msg.get_content_charset()

            # TODO: wrap in try/except -- _to_unicode may fail on bogus charsets.
            if charset is not None:
                self.log.debug("charset:" + charset)
                body = self._to_unicode(body, charset)
        self.body = body
예제 #4
0
파일: solution_03.py 프로젝트: rave78/MIR
def index(urls):
    """
    Goal:  Download a list of webpages and build a bag-of-words index.

    Parameter:
    urls:  list of strings, which represent the address of each webpage

    Each page is tokenized (lower-cased, stemmed) and the resulting dict
    {filename: [tokens]} is pickled to "index_file.pck".
    """
    if not os.path.isdir('files'):
        os.makedirs('files')

    for webpage in urls:
        name = webpage.split('/')[-1]
        # NOTE(review): interpolating `webpage` into a shell command is
        # injection-prone; prefer subprocess.run([...], shell=False).
        os.system("wget " + webpage + " -q -O files/" + name)
        logging.info("Downloaded: " + name)

    b_o_w = {}

    for web_file in os.listdir('files'):
        try:
            with open('files/' + web_file, 'r') as fh:
                text_html = fh.read()
            b_o_w[web_file] = [stem(word.lower())
                               for word in html2text(text_html).split()]
            logging.info("Tokenized: " + web_file)
        except Exception:
            # Something strange happened with the webpage of New_York_City
            print("There is a problem with " + web_file)

    # Binary mode: pickle payloads are bytes, not text (the original
    # "w" mode breaks under Python 3).
    with open("index_file.pck", "wb") as index_file:
        pickle.dump(b_o_w, index_file)
예제 #5
0
    def convert(self, doc, encoding, mimetype,
                logError=False, raiseException=False):
        """Convert PowerPoint document to raw text.

        Runs the external ``ppthtml`` tool against a temp copy of `doc`
        and strips the resulting HTML.  Returns ``(text, encoding)``.
        Tool stderr is captured and, depending on the flags, logged
        and/or raised as ConversionError.
        """
        tmp_name = self.saveFile(doc)
        err = TmpFile('')
        # The command line is identical on every platform -- the original
        # win32/else branches were byte-for-byte duplicates.
        html = self.execute('ppthtml "%s" 2> "%s"' % (tmp_name, str(err)))

        try:
            errors = open(str(err), 'r+').read()
        except OSError:
            errors = ""
        if errors:
            if logError:
                LOG.warn('Converter %s experienced an error %s' % (
                    self.content_description, errors)
                )
            if raiseException:
                raise ConversionError(errors)

        return html2text(html,
                         ignore_tags=('img',),
                         indent_width=4,
                         page_width=80), 'iso-8859-15'
예제 #6
0
def textread(content):
    """Strip HTML from `content` and return one lower-case line of text."""
    text = html2text(str(content))
    # Flatten newlines to spaces, drop both kinds of quote characters.
    for old, new in (('\n', ' '), ('"', ''), ("'", '')):
        text = text.replace(old, new)
    return text.lower()
예제 #7
0
파일: html.py 프로젝트: eaudeweb/naaya
 def convert(self, html):
     """Convert html data to raw text."""
     # Drop <img> tags entirely; no indent, very wide page so lines
     # are not re-wrapped.
     options = dict(ignore_tags=('img',), indent_width=0, page_width=256)
     return html2text(html, **options)
예제 #8
0
 def _processPageBody(self, page_body):
     """Process the link body with strip-o-gram library catching only the
     page content.
     """
     # Images and inline stylesheets carry no indexable text.
     return html2text(page_body, ignore_tags=('img', 'style'))
예제 #9
0
def htmlToText(original_html):
    """Strip HTML markup from `original_html` and return plain text.

    <img> tags are dropped entirely; output is wrapped to an
    80-character page with a 4-space indent.  (Dead commented-out
    html2safehtml experiments removed.)
    """
    return html2text(original_html, ignore_tags=("img",),
                     indent_width=4, page_width=80)
예제 #10
0
파일: files.py 프로젝트: rdb/pyweekorg
def oneshot_upload(request, entry_id):
    """Handle a scripted (pyweek-upload.py) file upload for an entry.

    Authenticates from POST fields, validates permissions and upload
    windows, saves the uploaded file, and returns a plain-text status.
    """
    entry = models.Entry.objects.filter(name__exact=entry_id)
    if not entry: return HttpResponse('Invalid entry *short* name')
    entry = entry[0]
    challenge = entry.challenge

    version = int(request.POST.get('version', '1'))
    if version < 2:
        return HttpResponse('Please update your pyweek-upload.py script')

    data = request.POST
    user = request.POST.get('user', '')
    if not user: return HttpResponse('Invalid login')

    user = models.User.objects.filter(username__exact=user)
    if not user: return HttpResponse('Invalid login')
    user = user[0]

    password = request.POST.get('password', '')
    if not user.check_password(password):
        return HttpResponse('Invalid login')

    # check authorisation
    if user not in entry.users.all() or not entry.isUploadOpen():
        return HttpResponse("You're not allowed to upload files!")

    # make sure user isn't sneeeky
    # BUG FIX: the original line ended in a stray comma, making is_final
    # a one-element tuple -- always truthy, so the final-upload window
    # check fired for every upload.
    is_final = bool(request.POST.get('is_final', False))
    if is_final and not challenge.isFinalUploadOpen():
        return HttpResponse('Final uploads are not allowed now')

    # avoid dupes
    if os.path.exists(
            os.path.join(MEDIA_ROOT, str(challenge.number), entry.name,
                         request.FILES['content_file'].name)):
        return HttpResponse('File with that filename already exists.')

    upload_file = request.FILES['content_file']
    file = models.File(
        challenge=challenge,
        entry=entry,
        user=user,
        created=datetime.datetime.now(models.UTC),
        content=upload_file,
        description=html2text(data.get('description', '')),
        is_final=is_final,
        is_screenshot=bool(data.get('is_screenshot', False)),
        thumb_width=0,
    )
    file.save()
    if file.is_final:
        entry.has_final = True
        entry.save()

    # .get() instead of [] -- the flag may be absent from the POST.
    if data.get('is_screenshot'):
        try:
            _make_thumbnail(upload_file)
        except IOError as e:
            return HttpResponse('Error uploading screenshot: {}'.format(e))
    return HttpResponse('File added!')
예제 #11
0
  def analyze_results(self):
    """Fetch review paragraphs from the URL entered in the UI, show them
    in the text widget, and tally positive/negative sentiment counts
    returned by classifyReviews()."""
    #print "Put the code here to analyze the reviews"
    try:
        self.reset_data()
        # Hide any previous ratings summary before showing fresh data.
        if self.RatingSummary is not None:
            self.RatingSummary.pack_forget()
        self.dataText.pack(fill=Y)
        rdata=requests.get(self.URLtext.get()+self.requestCriteria)
        #url=self.URLtext.get()
        soup=BeautifulSoup(rdata.content)
        reviewSections=soup.findAll("div",{"class":"review-wrapper"})
        # First "review-wrapper" div is boilerplate, not a review --
        # presumably a header block; TODO confirm against the site.
        if len(reviewSections) !=0:
            reviewSections.pop(0)
        for reviewSection in reviewSections:
            # The first <p> in each section carries the review text.
            reviewContent= reviewSection.findAll('p')[0]
            self.reviewList.append(html2text(reviewContent.text))
        #tkMessageBox.showinfo("URL entered",self.reviewList[0])
        self.dataText.delete('1.0',END)
        reviewText=""
        for review in self.reviewList:
            # Three blank lines between reviews in the display widget.
            reviewText=reviewText+review+("\n"*3)
        self.dataText.insert(END, reviewText)
        reviewSentiments=classifyReviews(self.reviewList)
        for reviewSentiment in reviewSentiments:
            if(reviewSentiment)=="pos":
                self.positiveRatings+=1
            if(reviewSentiment)=="neg":
                self.negativeRatings+=1



    except:
        # Re-raised, so this only logs before propagating.
        print "Unexpected error:", sys.exc_info()[0]
        raise
    def post(self):
        """Delete the named category and move its feeds to 'Other'."""
        category = html2text(self.get_argument('data_name', ''))
        removed = self.db.category.remove({'name': category}, safe=True)
        if removed['n'] == 0:
            return self.response_json(0, 'Delete Fail')

        # Orphaned feeds fall back to the catch-all category.
        self.db.rss.update({'category': category}, {'$set': {'category': 'Other'}})
        return self.response_json(1, 'Success')
예제 #13
0
    def _strip_html(self, html):
        """ remove HTML for use in RSS """
        if html2text is None:
            # Converter unavailable -- pass the HTML through untouched.
            return html
        return html2text(html, ignore_tags=('img', 'a'),
                         indent_width=4, page_width=80)
예제 #14
0
 def _processPageBody(self, page_body, content_type):
     """Process the link body with strip-o-gram library catching only the
     page content.
     """
     # XXX Improve by extracting text from other content types
     if content_type and 'html' not in content_type:
         return ''
     return html2text(page_body, ignore_tags=('img', 'style'))
예제 #15
0
def singlePageScrape(no):
    """Scrape one results page of Glasgow venues, recording
    name -> postcode pairs in the module-level `places` dict."""
    global places
    h = HTMLParser.HTMLParser()
    page = requests.get('https://www.list.co.uk/places/location:Glasgow(55.8621,-4.2465)/distance:10/page:' + no + "/#results'")
    tree = html.fromstring(page.text)
    names = tree.xpath('//h2[@class="head"]/text()')
    postcodes = tree.xpath('//span[@class="postal-code"]/text()')
    # Headings and postcodes are parallel lists on this page.
    for idx, name in enumerate(names):
        key = str(name.encode('ascii', 'ignore'))
        places[key] = str(html2text(postcodes[idx]).encode('ascii', 'ignore'))
    def post(self):
        """Create a new category unless the name is empty or taken."""
        category = html2text(self.get_argument('category', ''))
        if not category:
            return self.response_json(0, 'Category Name Required')

        if self.db.category.find_one({'name': category}):
            return self.response_json(0, 'Category Exists')

        if self.db.category.insert({'name': category}):
            return self.response_json(1, 'Success')

        return self.response_json(0, 'Failed')
예제 #17
0
	def handle(self, *args, **options):
		"""Scrape the RATP traffic pages for metro and RER incidents.

		Incident objects are built for metro matches but never
		.save()d, and RER matches are only printed -- looks
		unfinished; TODO confirm intent.
		"""
		compiled_pattern = re.compile(PATTERN)
		# metro:
		print "metro"
		consolidated_text = ""
		for line in urllib2.urlopen(URL % METRO):
			consolidated_text += line
		# Debug peek at a fixed slice of the stripped page text.
		print html2text(consolidated_text)[72:75]
		for result in compiled_pattern.findall(consolidated_text):
			(str_ligne, raison) = result
			incident = Incident()
			incident.line = Line.objects.get_or_create(name=str_ligne.strip())[0]
			incident.reason = raison.strip()
			incident.contributors = 'RATP'
		# rer:
		print "rer"
		consolidated_text = ""
		for line in urllib2.urlopen(URL % RER):
			consolidated_text += line

		for result in compiled_pattern.findall(consolidated_text):
			print result
예제 #18
0
파일: Robo.py 프로젝트: danielfl/thessearch
  def obtemHTML(self):
    """Fetch self.link from self.site over HTTP, append the stripped
    text to self.cache, feed the raw HTML to the parser, and recurse
    while proximoLink() yields a next link.

    NOTE(review): the final ``return html`` sits *inside* the recursion
    branch, so the method returns None when there is no next link and
    the recursive call's result is discarded -- looks unintentional;
    confirm against callers before changing.
    """
    try:
      httpconn = httplib.HTTPConnection(self.site)
      httpconn.request("GET", self.link)
      resp = httpconn.getresponse()
      html = resp.read()
      
      # Accumulate the plain text of every visited page.
      self.cache=self.cache+html2text(html)
    except:
      # Any failure (connection, parse) degrades to an empty page.
      html = ""

    self.feed(html)
    if self.proximoLink() != '':
      self.obtemHTML()

      return html
예제 #19
0
    def obtemHTML(self):
        """Download self.link from self.site, append its stripped text to
        self.cache, feed the HTML to the parser, and recurse while a next
        link exists.  Control flow (including the return placement inside
        the recursion branch) mirrors the original exactly."""
        try:
            conn = httplib.HTTPConnection(self.site)
            conn.request("GET", self.link)
            html = conn.getresponse().read()
            self.cache += html2text(html)
        except:
            html = ""

        self.feed(html)
        if self.proximoLink() != '':
            self.obtemHTML()

            return html
예제 #20
0
    def convert(self, doc, encoding=None, mimetype=None,
                logError=False, raiseException=False):
        """Strip HTML from `doc` and return ``(utf-8 bytes, 'utf-8')``.

        Non-unicode input is decoded using `encoding`, a charset sniffed
        from the document, or ASCII as a last resort ('replace' errors).
        """
        if not isinstance(doc, unicode):
            if not encoding:
                match = charset_reg.search(doc)
                encoding = match.group(1) if match is not None else 'ascii'
            doc = unicode(doc, encoding, 'replace')

        text = html2text(convert_entities(doc))
        return text.encode('utf-8'), 'utf-8'
예제 #21
0
    def get_message_body(self, message):
        """
        Get the body of an email
        Recursively look for the body for different mimetypes
        Returns the body as text/plain
        """
        if 'payload' in message:
            return self.get_message_body(message['payload'])
        if 'parts' in message:
            return self.get_message_body(message['parts'][0])

        raw = base64.urlsafe_b64decode(message['body']['data'].encode('ASCII'))
        markdown_data = html2text(raw)
        html_body = unicode(raw.replace("\n", "<br/>"), "ISO-8859-1")

        # return {markdown, html}
        if markdown_data:
            return {'markdown': unicode(markdown_data, "ISO-8859-1"),
                    'html': html_body}
        return {'html': html_body}
예제 #22
0
def send_html_mail_nt(subject,
                      sender=settings.DEFAULT_FROM_EMAIL,
                      recip="",
                      context=None,
                      html_template="",
                      text_template="",
                      sender_name="",
                      html_content="",
                      text_content="",
                      recip_list=None,
                      sender_formatted=""):
    """Prepare an HTML e-mail and connect to the configured SMTP server.

    HTML/text bodies come from rendered templates when given, otherwise
    from the *_content arguments; a missing text body is derived from
    the sanitised HTML.  SSL/TLS and credentials come from Django
    settings.

    NOTE(review): `charset` is not defined in this function -- it must
    be a module-level global; confirm.  The visible code only connects
    and logs in; the actual send presumably follows below this chunk.
    """
    from stripogram import html2text
    from feedparser import _sanitizeHTML

    if not context: context = {}
    if html_template:
        html = render(context, html_template)
    else:
        html = html_content
    if text_template:
        text = render(context, text_template)
    else:
        text = text_content
    if not text:
        # Derive a plain-text alternative from the sanitised HTML.
        text = html2text(_sanitizeHTML(html, charset))

    if not recip_list: recip_list = []
    if recip: recip_list.append(recip)

    try:
        if getattr(settings, "EMAIL_USE_SSL", False):
            server = SMTP_SSL(settings.EMAIL_HOST, settings.EMAIL_PORT)
        else:
            server = SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        if settings.EMAIL_USE_TLS:
            server.ehlo()
            server.starttls()
            server.ehlo()
        if settings.EMAIL_HOST_USER and settings.EMAIL_HOST_PASSWORD:
            server.login(settings.EMAIL_HOST_USER,
                         settings.EMAIL_HOST_PASSWORD)
    except Exception, e:
        # Best-effort: connection problems abort the send silently.
        print e
        return
예제 #23
0
def htmlmail(sbj,recip,msg,template='',texttemplate='',textmsg='',images=(), recip_name='',sender=settings.DEFAULT_FROM_EMAIL,sender_name='',charset=charset):
   """
   if you want to use Django template system:
      use `msg` and optionally `textmsg` as template context (dict)
      and define `template` and optionally `texttemplate` variables.
   otherwise msg and textmsg variables are used as html and text message sources.

   if you want to use images in html message, define physical paths and ids in tuples.
   (image paths are relative to  MEDIA_ROOT)
   example:
   images=(('email_images/logo.gif','img1'),('email_images/footer.gif','img2'))
   and use them in html like this:
   <img src="cid:img1">
   ...
   <img src="cid:img2">
   """
   html=render(msg,template)
   # Fall back to stripping the sanitised HTML when no text source given.
   if texttemplate or textmsg: text=render((textmsg or msg),texttemplate)
   else: text= html2text(_sanitizeHTML(html,charset))

   # 'related' root lets the HTML part reference inline images by CID.
   msgRoot = MIMEMultipart('related')
   msgRoot['Subject'] = sbj
   msgRoot['From'] = named(sender,sender_name)
   msgRoot['To'] =  named(recip,recip_name)
   msgRoot.preamble = 'This is a multi-part message in MIME format.'

   # 'alternative' carries the text and HTML renditions of the body.
   msgAlternative = MIMEMultipart('alternative')
   msgRoot.attach(msgAlternative)

   msgAlternative.attach(MIMEText(text, _charset=charset))
   msgAlternative.attach(MIMEText(html, 'html', _charset=charset))

   for img in images:
      fp = open(settings.MEDIA_ROOT+img[0], 'rb')
      msgImage = MIMEImage(fp.read())
      fp.close()
      # Content-ID must match the cid: reference used in the HTML.
      msgImage.add_header('Content-ID', '<'+img[1]+'>')
      msgRoot.attach(msgImage)

   smtp = SMTP()
   smtp.connect(smtp_server)
   if smtp_user: smtp.login(smtp_user, smtp_pass)
   smtp.sendmail(sender, recip, msgRoot.as_string())
   smtp.quit()
예제 #24
0
    def convert(self,
                doc,
                encoding=None,
                mimetype=None,
                logError=False,
                raiseException=False):
        """Strip HTML tags from `doc` and return ``(utf-8 bytes, 'utf-8')``.

        If `doc` is not already unicode it is decoded using `encoding`,
        a charset sniffed from the document via `charset_reg`, or ASCII
        as a last resort ('replace' for undecodable bytes).
        """
        # convert to unicode
        if not isinstance(doc, unicode):
            if not encoding:
                mo = charset_reg.search(doc)
                if mo is not None:
                    encoding = mo.group(1)
                else:
                    encoding = 'ascii'  # guess
            doc = unicode(doc, encoding, 'replace')
        # Resolve HTML entities before stripping the markup.
        doc = convert_entities(doc)
        result = html2text(doc)

        # convert back to utf-8
        return result.encode('utf-8'), 'utf-8'
예제 #25
0
파일: result.py 프로젝트: zbw/zbw.ejSearch
    def quote(self, result):
        """Collect up to three sentences from the result's text that
        contain one of the search terms.  Returns a set of strings.
        """
        #TODO: only return quotes when searchterm is not in Title

        # BUG FIX: ('JournalPaper') is a plain string, so the original
        # `in` test matched any *substring* (e.g. 'Journal'); a real
        # one-element tuple gives exact membership.
        if result.portal_type in ('JournalPaper',):
            result_text = html2text(result.getAbstract,
                    ignore_tags=('a','span','br','p'))

        else:
            #Due to catalog out of sync, result.SearchableText sometimes returns
            #'Missing.Value', while result.getObject().SearchableText returns
            #as expected. So in that case we must get the object. We test this
            #with basestring (True for str and unicode)

            if hasattr(result, 'SearchableText'):
                if isinstance(result.SearchableText, basestring):
                    result_text = result.SearchableText
                else:
                    result_text = result.getObject().SearchableText()
            else:
                result_text = result.getObject().SearchableText()
                #TODO catch errors

        quotes = set()
        search_text = self.request.SearchableText.split()

        for t in search_text:
            t = t.lower()
            t = t.replace('"', '')
            if t in result_text.lower():
                # Split into rough sentences and keep the matching ones.
                lines = re.split(r'\s*[!?.]\s*', result_text)
                for line in lines:
                    if t in line.lower() and len(quotes) < 3:
                        quotes.add(line)
        return quotes
예제 #26
0
def scolar_news_summary_rss(context, title, sco_url, n=5):
    """rss feed for scolar news

    Builds an RSS 2.0 document from the latest `n` news summaries and
    returns it as an XML string.
    """
    news = scolar_news_summary(context, n=n)
    items = []
    # BUG FIX: the loop variable used to be `n`, shadowing the count
    # parameter above.
    for entry in news:
        text = html2text(entry['text'])
        items.append(PyRSS2Gen.RSSItem(
            title=unicode('%s %s' % (entry['rssdate'], text), SCO_ENCODING),
            link=sco_url + '/' + entry['url'],
            pubDate=entry['date822']))
    rss = PyRSS2Gen.RSS2(
        title=unicode(title, SCO_ENCODING),
        link=sco_url,
        description=unicode(title, SCO_ENCODING),
        lastBuildDate=datetime.datetime.now(),
        items=items)
    f = StringIO()
    rss.write_xml(f)
    # getvalue() replaces the seek/read/close dance on the StringIO.
    return f.getvalue()
예제 #27
0
파일: utils.py 프로젝트: amitu/dutils
def send_html_mail_nt(
    subject, sender=settings.DEFAULT_FROM_EMAIL, recip="", context=None, 
    html_template="", text_template="", sender_name="",
    html_content="", text_content="", recip_list=None, sender_formatted=""
):
    """Prepare an HTML e-mail and connect to the configured SMTP server.

    HTML/text bodies come from rendered templates when given, otherwise
    from the *_content arguments; a missing text body is derived from
    the sanitised HTML.  SSL/TLS and credentials come from Django
    settings.

    NOTE(review): `charset` is not defined here -- presumably a module
    global; confirm.  The visible code only connects and logs in; the
    actual send presumably follows below this chunk.
    """
    from stripogram import html2text
    from feedparser import _sanitizeHTML

    if not context: context = {}
    if html_template:
        html = render(context, html_template)
    else: html = html_content
    if text_template:
        text = render(context, text_template)
    else: text = text_content
    if not text:
        # Derive a plain-text alternative from the sanitised HTML.
        text = html2text(_sanitizeHTML(html,charset))        

    if not recip_list: recip_list = []
    if recip: recip_list.append(recip)

    try:
        if getattr(settings, "EMAIL_USE_SSL", False):
            server = SMTP_SSL(settings.EMAIL_HOST, settings.EMAIL_PORT)
        else:
            server = SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        if settings.EMAIL_USE_TLS:
            server.ehlo()
            server.starttls()
            server.ehlo()
        if settings.EMAIL_HOST_USER and settings.EMAIL_HOST_PASSWORD:
            server.login(
                settings.EMAIL_HOST_USER, settings.EMAIL_HOST_PASSWORD
            )
    except Exception, e: 
        # Best-effort: connection problems abort the send silently.
        print e
        return
예제 #28
0
from stripogram import html2text
import sys
import httplib2


if __name__ == '__main__':
    # Fetch the URL given on the command line and print its plain text.
    http = httplib2.Http()
    headers, body = http.request(sys.argv[1])
    ctype = headers['content-type']
    # Extract the declared charset from the Content-Type header.
    # NOTE(review): `charset` is never used below -- the body is always
    # decoded as UTF-8; looks like an unfinished refactor, confirm.
    charset = ctype[ctype.index('charset=')+8:]
    body = body.decode('UTF-8')
    try:
        text = html2text(body.encode('utf8', 'ignore'))
    except UnicodeDecodeError:
        # Fall back to stripping the unicode body directly.
        text = html2text(body)
    print text
예제 #29
0
            messages.append(source_message)

print messages

for item in messages:
    for destination in sync.destination.all():
        print destination.sn_type
        if destination.sn_type.code == 'vk' and destination.enabled:
            vk = VK(vk_settings)
            attachments = []
            message = ""
            if 'title' in item:
                message = "%s" % (item['title']) #, html2safehtml(item['text'])
            if 'text' in item:
                try:
                    message+="   %s" % html2text(item['text'])
                except Exception:
                    message+="   %s" % strip_tags(item['text'])

            if item['attachements']:
                for attach in item['attachements']:
                    attachments.append(attach['src'])
            if message:
                res = vk.VKPost(destination, message, attachments)
                try:
                    js = json.loads(res)
                    if 'error' in js:
                        raise Exception, 'error found in vk responce'
                    else:
                        print res
                except Exception:
예제 #30
0
def html_to_text(url):
    """Download `url` and return its content with HTML markup stripped."""
    page = urllib.urlopen(url)
    return html2text(page.read())
예제 #31
0
def entry_upload(request, entry_id):
    """Handle the web-form upload of a file for a challenge entry.

    Validates membership and the upload window, rejects duplicate
    filenames and out-of-window final uploads, then saves the file and
    (for screenshots) generates a thumbnail.
    """
    if request.user.is_anonymous():
        return HttpResponseRedirect('/login/')
    entry = get_object_or_404(models.Entry, pk=entry_id)
    challenge = entry.challenge

    is_member = request.user in entry.users.all()
    if not is_member or not entry.isUploadOpen():
        messages.error(request, "You're not allowed to upload files!")
        return HttpResponseRedirect('/e/%s/'%entry_id)

    if request.method == 'POST':
        f = FileForm(request.POST, request.FILES)
    else:
        f = FileForm()

    info = {
        'challenge': challenge,
        'entry': entry,
        'files': entry.file_set.all(),
        'is_member': True,
        'is_owner': True,
        'form': f,
    }

    # just display the form?
    if not f.is_valid():
        return render_to_response('challenge/entry_file.html', info,
            context_instance=RequestContext(request))

    # make sure user isn't sneeeky
    if f.cleaned_data['is_final'] and not challenge.isFinalUploadOpen():
        f._errors['is_final'] = f.error_class(["Final uploads are not allowed now."])
        return render_to_response('challenge/entry_file.html', info,
            context_instance=RequestContext(request))

    # avoid dupes
    if os.path.exists(os.path.join(MEDIA_ROOT, str(challenge.number), entry.name,
            request.FILES['content'].name)):
        f._errors['content'] = f.error_class(["File with that filename already exists."])
        return render_to_response('challenge/entry_file.html', info,
            context_instance=RequestContext(request))

    file = models.File(
        challenge=challenge,
        entry=entry,
        user=request.user,
        created=datetime.datetime.utcnow(),
        content=request.FILES['content'],
        description=html2text(f.cleaned_data['description']),
        is_final=f.cleaned_data['is_final'],
        is_screenshot=f.cleaned_data['is_screenshot'],
        thumb_width=0,
    )
    file.save()
    if file.is_final:
        entry.has_final = True
        entry.save()

    if file.is_screenshot:
        try:
            _make_thumbnail(file)
        except Exception:
            # BUG FIX: this error line was tab-indented inside a
            # space-indented block (TabError under Python 3); the bare
            # except is also narrowed to Exception.
            messages.error(request, 'File is not an image')
            return render_to_response('challenge/entry_file.html', info,
                context_instance=RequestContext(request))

    messages.success(request, 'File added!')
    return HttpResponseRedirect('/e/%s/'%entry_id)
예제 #32
0
                        if msgparttype == 'text/plain':
                            emailTEXT = payload
                        elif msgparttype == 'text/html':
                            emailHTML = payload
                else:
                    payload = msg.get_payload(decode=True)
                    if contenttype[0] == 'text/plain':
                        emailTEXT = payload
                    elif contenttype[0] == 'text/html':
                        emailHTML = payload

                # If not TEXT version exists, convert HTML version to TEXT
                if not emailTEXT:
                    if emailHTML:
                        emailTEXT = stripogram.html2text(
                            emailHTML.decode(
                                "utf-8", "replace").encode("utf-8")).lstrip()

                # Get Yahoo Term Extractor generated terms
                # Results in UnicodeDecodeError on some emails which can be ignored similar to this
                # http://code.djangoproject.com/attachment/ticket/1086/feeds.py.2.diff
                # or replace them similar to https://bugs.launchpad.net/gpodder/+bug/252506
                keywords = None
                if emailTEXT:
                    keywords = termExtraction(appid, emailTEXT)[-5:]

                # Encode the emailTEXT and emailHTML for insertion into sqlite newer versions
                if emailTEXT:
                    emailTEXT = emailTEXT.decode('utf-8', 'replace')
                if emailHTML:
                    emailHTML = emailHTML.decode('utf-8', 'replace')
예제 #33
0
def get_context(lines, i, nmax):
    """Return up to `nmax` characters of whitespace-normalised plain
    text taken from the five lines of `lines` starting at index `i`."""
    # str.join replaces the removed-in-py3 string.join helper.
    text = html2text(' '.join(lines[i:i + 5]))
    # Collapse newlines and runs of whitespace into single spaces.
    text = ' '.join(text.replace("\n", " ").split())
    return text[:nmax]
예제 #34
0
def upload_award(request, entry_id):
    """Upload a 64x64 award image and bestow it on an entry.

    Members of the entry may not award themselves; the image must be
    readable and exactly 64x64 pixels.
    """
    creator = request.user

    if creator.is_anonymous():
        return HttpResponseRedirect('/login/')
    entry = get_object_or_404(models.Entry, pk=entry_id)
    challenge = entry.challenge

    is_member = creator in entry.users.all()
    if is_member:
        messages.error(request, 'You cannot give an award to your own entry!')
        return HttpResponseRedirect('/e/%s/' % entry_id)

    info = dict(
        challenge=challenge,
        entry=entry,
        awards=creator.award_set.all(),
        give_form=GiveAwardForm(creator),
    )

    if request.method == 'POST':
        f = UploadAwardForm(request.POST, request.FILES)
    else:
        f = UploadAwardForm()
    info['upload_form'] = f

    # Display form
    if not f.is_valid():
        return render(request, 'challenge/upload_award.html', info)

    # TODO: make sure the filename is unique (an os.path.exists check
    # was previously sketched here but commented out).

    # check dimensions of image
    ok = False
    try:
        image = Image.open(request.FILES['content'])
        if image.size == (64, 64):
            ok = True
    except Exception:
        # Unreadable image -- fall through to the error message below.
        pass
    if not ok:
        messages.error(request, 'The image could not be read or is not 64x64')
        return render(request, 'challenge/upload_award.html', info)

    # Write award image to disk
    award = models.Award(
        creator=creator,
        content=request.FILES['content'],
        description=html2text(f.cleaned_data['description']),
    )
    award.save()

    if _give_award(challenge, creator, entry, award):
        messages.success(request, 'Award given!')
    else:
        messages.error(request, 'This entry already has that award.')

    return HttpResponseRedirect('/e/%s/' % entry_id)
예제 #35
0
    opfilepath = opdirpath + allNames[len(allNames) - 1]
    soup = None
    try:
        soup = bs(open(filepath))
        soup = soup.body
    except Exception:
        print("cannot open:" + filepath, file=error)
        continue

    if not soup:
        print("contains no body:" + filepath, file=error)
        continue

    heading = soup.find('h1', class_='firstHeading')
    if heading:
        heading = html2text(str(heading)).strip()

    paragraphlist = soup.findAll('p')

    if paragraphlist or heading:
        try:
            opfile = open(opfilepath, 'w+')
        except Exception:
            print("cannot create o/p filename:" + opfilepath, file=error)
            continue
    else:
        continue

    if heading:
        print(heading, file=opfile)
        print(heading, file=opfile)
예제 #36
0
def get_text2(url):
    """Fetch `url` and return its HTML-stripped text encoded as UTF-8."""
    from stripogram import html2text
    response = requests.get(url)
    return html2text(response.content).encode('utf-8')
예제 #37
0
import urllib

from stripogram import html2text

# Fetch the blog front page (Python 2 urllib API).
myurl = urllib.urlopen("http://tuxworld.wordpress.com")

html_string = myurl.read()

# Strip all HTML markup, leaving plain text.
text = html2text( html_string )

print text
예제 #38
0
def entry_display(request, entry_id):
    """Render a challenge entry page.

    Shows the entry's files/thumbnail and diary entries; for users allowed
    to rate while rating is open, displays (and on POST, saves) a rating
    form; once the challenge is fully done, shows the tallied results.
    """
    entry = get_object_or_404(models.Entry, pk=entry_id)
    challenge = entry.challenge
    user_list = entry.users.all()
    is_member = request.user in list(user_list)
    # Most recent screenshot (if any) becomes the page thumbnail.
    files = entry.file_set.filter(
        is_screenshot__exact=True).order_by("-created")[:1]
    thumb = None
    if files: thumb = files[0]

    # handle adding the ratings form and accepting ratings submissions
    f = None
    if entry.may_rate(request.user, challenge) and challenge.isRatingOpen():
        errors = {}

        # get existing scores
        # (loop-and-break grabs the first matching rating, or leaves None)
        rating = None
        for rating in entry.rating_set.filter(user__id__exact=request.user.id):
            break

        # fields for rating editing
        if request.method == 'POST':
            f = RatingForm(request.POST)
            if f.is_valid():
                if rating is not None:
                    # edit existing
                    rating.disqualify = f.cleaned_data['disqualify']
                    rating.nonworking = f.cleaned_data['nonworking']
                    rating.fun = f.cleaned_data['fun']
                    rating.innovation = f.cleaned_data['innovation']
                    rating.production = f.cleaned_data['production']
                    # comment is sanitised to plain text before storage
                    rating.comment = html2text(f.cleaned_data['comment'])
                else:
                    # create new
                    rating = models.Rating(
                        entry=entry,
                        user=request.user,
                        disqualify=f.cleaned_data['disqualify'],
                        nonworking=f.cleaned_data['nonworking'],
                        fun=f.cleaned_data['fun'],
                        innovation=f.cleaned_data['innovation'],
                        production=f.cleaned_data['production'],
                        comment=html2text(f.cleaned_data['comment']),
                    )
                rating.save()
                messages.info(request, 'Ratings saved!')
                # NOTE(review): redirects using entry.name although the view
                # was looked up by entry_id — confirm these are interchangeable.
                return HttpResponseRedirect("/e/%s/" % entry.name)
        elif rating is not None:
            # GET with an existing rating: pre-fill the form with saved scores.
            data = dict(disqualify=rating.disqualify,
                        nonworking=rating.nonworking,
                        fun=rating.fun,
                        innovation=rating.innovation,
                        production=rating.production,
                        comment=rating.comment)
            f = RatingForm(data)
        else:
            f = RatingForm()

    rating_results = False
    if challenge.isAllDone() and entry.has_final:
        # display ratings
        # d aliases rating_results; percentage strings are added in place.
        d = rating_results = entry.tally_ratings()
        d['dp'] = '%d%%' % (d.get('disqualify', 0) * 100)
        d['dnwp'] = '%d%%' % (d.get('nonworking', 0) * 100)

    return render_to_response('challenge/entry.html', {
        'challenge': challenge,
        'entry': entry,
        'files': entry.file_set.all(),
        'thumb': thumb,
        'diary_entries': entry.diary_entries(),
        'is_user': not request.user.is_anonymous(),
        'is_member': is_member,
        'is_team': len(user_list) > 1,
        'is_owner': entry.user == request.user,
        'form': f,
        'rating': rating_results,
        'awards': entry.entryaward_set.all(),
    },
                              context_instance=RequestContext(request))
예제 #39
0
# webExtractor
# Created by JKChang
# 30/05/2017, 21:39
# Tag:web extractor
# Description: fetch an exercises page, strip its HTML, and print each line.
import urllib.request, urllib.error, urllib.parse
import requests
from bs4 import BeautifulSoup
from stripogram import html2text
import re

url = 'http://www.w3resource.com/python-exercises/python-basic-exercises.php'
html = requests.get(url).text
text = html2text(html)
# NOTE(review): on Python 3 this raises TypeError — encode() yields bytes,
# which cannot be split by a str separator; the py3-style imports above
# conflict with this py2 idiom. Confirm the intended interpreter.
l = text.encode('utf-8').split('\n')
index = 1

# NOTE(review): 'pat' and 'index' are never used; the pattern also looks like
# it was meant to be r'(\d+)\. ' — confirm before relying on it.
pat = '(/d+)/. '
for line in l:
    # if len(line) ==0:
    #     continue
    # elif line[0].isdigit():
    print(line)

# if line[0].isdigit():
#        print line.encode('utf-8')

# print html.encode('utf-8')
#
# soup = BeautifulSoup(html,'lxml')
# res = soup.findAll("article", {"class": "listingItem"})
예제 #40
0
	# Loop-body fragment (Python 2): for each fetched IMAP message, extract
	# sender, subject and body, insert an incident row, and mark the message
	# deleted. 'm', 'num', 'direccion', 'M', 'c' and 'conn' come from the
	# enclosing (not visible) loop/setup code.
	n = re.sub('<(\w+@\w+(?:\.\w+)+)>','',m)
	remitente = n
	##print (remitente)
	##print ("------------------------------------------")
	typ,asunto =  M.fetch(num,'(BODY[HEADER.FIELDS (SUBJECT)])')
	# Strip the leading "Subject: " tag from the header line
	m = re.sub('^Subject: ','',asunto[0][1])
	textoasunto = m
	##print (textoasunto)
	##print ("------------------------------------------")
	typ,cuerpo = M.fetch(num,'(BODY[TEXT])')
	texto = cuerpo[0][1]
	# Values for the database insert
	propietarioI=remitente
	emailpropietarioI=direccion
	asuntoI=textoasunto
	# Convert the body so it can be inserted correctly (HTML stripped,
	# then decoded as latin-1)
	#textoI= unicode(texto,"latin-1")
	textoI2=unicode(html2text(texto,ignore_tags=("img",),indent_width=4,page_width=80),"latin-1")
	fechaI=datetime.datetime.now()
	#
	# Insert the incident as 'pending' — keep this consistent with whatever
	# state id represents pending incidents
	#
	relestado_idI='1'
	c.execute("INSERT INTO gestorincidencias_incidencias (propietario,emailpropietario,asunto,texto,fecha,relestado_id) VALUES (%s,%s,%s,%s,%s,%s)",(propietarioI,emailpropietarioI,asuntoI,textoI2,fechaI,relestado_idI))
	conn.commit()
	M.store(num, '+FLAGS', '\\Deleted')
M.expunge()
M.close()
M.logout()
                      # Fragment: pick up text/plain and text/html payloads from a
                      # (multipart) message; enclosing loop/if headers are not visible.
                      payload = msgpart.get_payload(decode=True)
                      if msgparttype == 'text/plain':
                         emailTEXT = payload
                      elif msgparttype == 'text/html':
                         emailHTML = payload
               else:
                  # Non-multipart message: the whole payload is the body.
                  payload = msg.get_payload(decode=True)
                  if contenttype[0] == 'text/plain':
                     emailTEXT = payload
                  elif contenttype[0] == 'text/html':
                     emailHTML = payload

               # If not TEXT version exists, convert HTML version to TEXT
               if not emailTEXT:
                  if emailHTML:
                     # decode with replacement so malformed bytes cannot raise
                     emailTEXT = stripogram.html2text(emailHTML.decode("utf-8","replace").encode("utf-8")).lstrip()

               # Get Yahoo Term Extractor generated terms
               # Results in UnicodeDecodeError on some emails which can be ignored similar to this
               # http://code.djangoproject.com/attachment/ticket/1086/feeds.py.2.diff
               # or replace them similar to https://bugs.launchpad.net/gpodder/+bug/252506
               keywords = None
               if emailTEXT:
                  # keep only the last five extracted terms
                  keywords = termExtraction(appid, emailTEXT)[-5:]

               # Encode the emailTEXT and emailHTML for insertion into sqlite newer versions
               if emailTEXT:
                  emailTEXT = emailTEXT.decode('utf-8', 'replace')
               if emailHTML:
                  emailHTML = emailHTML.decode('utf-8', 'replace')
예제 #42
0
def test():
    """Debug endpoint: return, as JSON, the median number of registered
    voters ('inscrits') per parliamentary group.

    NOTE(review): everything that previously followed the first ``return``
    was unreachable scratch code (a pile of old experiments, each ending in
    its own ``return``) and has been removed.
    """
    # Bucket each deputy's 'inscrits' count by parliamentary group.
    gp = {}
    for d in mdb.deputes.find({}, {'stats.election': 1, 'groupe_abrev': 1}):
        g = d['groupe_abrev']
        if g not in gp:
            gp[g] = []
        gp[g].append(d['stats']['election']['inscrits'])
    # Median per group (numpy imported lazily, as in the original code).
    moy = {}
    import numpy
    for g, v in gp.iteritems():
        moy[g] = numpy.median(numpy.array(v))
    return json_response(moy)
예제 #43
0
    def getDossier(url):
        """Scrape an Assemblée Nationale dossier page and return a list of
        (texte, lecture, doc, num[, cmp_doc, cmp_num]) tuples describing the
        legislative texts referenced on the page.

        Uses outer-scope 'docsan' (document-number lookup table).

        NOTE(review): '_l', 'texte' and 'num' are only assigned inside some
        branches — if the 'Travaux préparatoires' or 'Commission Mixte
        Paritaire' paragraph appears before an 'Assemblée nationale -'
        paragraph, this can raise UnboundLocalError/KeyError. Confirm the
        page structure guarantees the ordering.
        """
        ops = []
        r = requests.get(url)
        texte = "NOPE"
        # Page text split into paragraphs; page_width kept huge so logical
        # lines are not wrapped before splitting on blank lines.
        doc = html2text(r.content,
                        page_width=10000).decode('iso8859-1').split(u'\n\n')
        for i, l in enumerate(doc):
            # Normalise ordinals so the string tests below match.
            l = l.replace(u'1ère', u'première').replace(u'2e ', u'deuxième ')
            #print l
            search = False
            if l[0:21] == u'Assemblée nationale -':
                if l[22:38] == u'première lecture' or l[
                        22:38] == u'Nouvelle lecture':
                    _l = l[39:]
                    j = 0
                    # Look ahead (up to 5 paragraphs) for the bill title.
                    while not (u"proposition de loi" in _l.lower()
                               or u'projet de loi' in _l.lower() or j > 5):
                        j += 1
                        _l = doc[i + j].replace(u'1ère', u'première').replace(
                            u'2e ', u'deuxième ')

                    texte = _l.split(u',')[0]
                else:
                    _l = l
                search = True
                # 'lecture' = the first two words after the dash.
                lecture = u' '.join(l[22:].split(' ')[0:2])
            if l[0:21] == u'Travaux préparatoires':
                j = 0
                while not (u"proposition de résolution" in _l.lower()
                           or j > 5):
                    j += 1
                    _l = doc[i + j].replace(u'1ère', u'première')
                texte = _l.split(u',')[0]
                search = True
                lecture = ""
            if l[0:26] == u"Commission Mixte Paritaire":

                j = 0
                m = None

                # Find the CMP text number within the next 5 paragraphs.
                while not m and j <= 5:
                    j += 1
                    m = re.search(
                        u"sous le n° ([0-9]+) +à l'Assemblée nationale",
                        doc[i + j])
                n = m.groups()[0] if m else None
                if n and n in docsan.keys():
                    ops.append(
                        (texte, "texte de la commission mixte paritaire",
                         docsan[num], num, docsan[n], n))
                    #print (texte,"",docsan[num])
            if search:
                # Extract the text number, e.g. ", TA n° 123 -" or ", n° 45".
                m = re.search(u", *(TA|) +n *° *([0-9]+)[^\-]* *", _l)
                if m:
                    if m.groups()[0] == 'TA':
                        num = "TA%04d" % int(m.groups()[1])
                    else:
                        num = m.groups()[1]
                    ops.append((texte, lecture, docsan[num], num))

                    #print (texte,lecture,docsan[num])
        return ops
예제 #44
0
    # Loop-body fragment (near-duplicate of an earlier snippet): converts one
    # saved HTML page into a plain-text output file; truncated at the end.
    opfilepath = opdirpath + allNames[len(allNames)-1]
    soup = None
    try:
        soup = bs(open(filepath))
        soup = soup.body
    except Exception :
        print("cannot open:" + filepath, file=error);
        continue;

    if not soup:
        print("contains no body:" + filepath , file = error)
        continue;

    # Page title: <h1 class="firstHeading">, stripped to plain text.
    heading = soup.find('h1',class_='firstHeading')
    if heading:
        heading = html2text(str(heading)).strip()

    paragraphlist = soup.findAll('p')



    # Only create an output file when there is a title or paragraphs to write.
    if paragraphlist or heading:
        try:
            opfile = open(opfilepath, 'w+')
        except Exception :
            print("cannot create o/p filename:"+opfilepath, file=error)
            continue;
    else:
        continue;

    # NOTE(review): the snippet is truncated here — the body of this 'if'
    # is missing in the source.
    if heading:
import sys
import json

def get_words(text):
    """Return the list of word tokens (runs of \\w characters) in *text*."""
    # Raw-string pattern avoids the invalid-escape warning of '\w+', and
    # re.findall uses the module's internal pattern cache, so there is no
    # need to call re.compile on every invocation.
    return re.findall(r'\w+', text)

# Build per-page tf features and corpus-wide df from a JSON list of pages.
# NOTE(review): truncated at the end (the final loop has no body), and
# 'html2text' / 'defaultdict' imports are not visible in this snippet.
if __name__ == "__main__":

    f_in = open(sys.argv[1])
    f_out = open(sys.argv[2], 'w')
    pages = json.load(f_in)
    json_doc = []
    df = defaultdict(int)
    for page in pages:
        # Fall back to the raw string if encoding fails.
        try:
            text = html2text(page['html'].encode('utf8', 'ignore'))
        except UnicodeDecodeError:
            text = html2text(page['html'])
        words = get_words(text)
        n = len(words)
        # Term frequency for this page, normalised by its word count.
        tf = defaultdict(int)
        for word in words: tf[word] +=1
        for word in tf: tf[word] /= float(n)
        json_doc.append({'url': page['url'], 'features': tf})
        # Document frequency counts each word once per page.
        words_set = set(words)
        for word in words_set: df[word] +=1
    n = len(pages)
    for word in df: df[word] /= float(n)
    for row in json_doc:
        features = row['features']
        for word in features:
예제 #46
0
 def summary(self):
     ''' Plain-text preview of the content: HTML stripped, truncated to
     at most 255 characters (with a trailing ellipsis when cut). '''
     stripped = html2text(self.content)
     if len(stripped) <= 255:
         return stripped
     return stripped[:252] + '...'
예제 #47
0
	def obtemTexto(self):
	  """Return the plain text of the content.

	  Normalises self.conteudo in place (whitespace removal, then HTML tag
	  removal) before converting what remains to text via html2text.
	  """
	  self.conteudo=self.removerEspacos()
	  self.conteudo=self.removerTagsHTML()

	  return html2text(self.conteudo)
예제 #48
0
def get_feature_vector(words, df):
    """Build a tf/df-weighted feature mapping for *words*.

    Each word's term frequency (count / len(words)) is divided by its
    document frequency taken from *df*; words missing from *df* are
    skipped. Returns a defaultdict(int), so absent keys read as 0.
    """
    total = float(len(words))
    counts = defaultdict(int)
    for token in words:
        counts[token] += 1
    features = defaultdict(int)
    for token in counts:
        if token in df:
            features[token] = (counts[token] / total) / float(df[token])
    return features

# Python 2 script: fetch a URL, extract its words, and print the tf/df
# feature vector computed against a pre-built model file.
if __name__ == '__main__':
    url, model = sys.argv[1], sys.argv[2]

    page = urlopen(url).read()
    # Fall back to the raw page if re-encoding fails.
    try:
        text = html2text(page.encode('utf8', 'ignore'))
    except UnicodeDecodeError:
        text = html2text(page)

    # Model layout: list of page records, with the df table as the last item.
    dataset = json.load(open(model))
    texts = dataset[:-1]
    df = dataset[-1]
    vec = get_feature_vector(get_words(text), df)
    print vec
    # similarity_list = []
    # for text in texts:
    #     similarity_list.append((cos(vec, text['features']), text['url']))
    #
    # similarity_list.sort(reverse=True)
    # for entry in similarity_list
예제 #49
0
        return False


# Python 2 crawler: for each labelled site, check reachability, save its
# stripped text under docs/, and log the response time to a working file.
if __name__ == '__main__':
    f = open('uk2002-spamlabels1.txt')
    f1 = open('uk2002-working.txt', "wb")
    line = f.readline()
    i = 0
    while line:
        print i
        i = i + 1
        # Each input line: "<host> <label>".
        space = line.split()
        site = 'http://' + space[0]
        ##print site;
        ##print checkUrl(site)
        now = time.time()
        if (checkUrl(site)):
            later = time.time()
            difference = int(later - now)
            # NOTE(review): bare except deliberately treats any failure
            # (timeout, HTTP error, I/O) as "non-responsive" — best effort.
            try:
                with Timeout(5):
                    file("docs/" + space[0] + ".txt",
                         "w").write(html2text(urllib2.urlopen(site).read()))
                print site + " " + str(difference)
                f1.write(site + " " + space[1] + " " + str(difference) + "\n")
            except:
                print(site + " " + "non-responsive\n")
        line = f.readline()
    f.close()
    f1.close()