Example #1
def put_daily(cur, date):
    put_counts(cur, date)
    put_actives(cur, date, False)
    #if date.weekday() == weekday_saturday:
    #    put_actives(cur, date, True)
    put_crashes(cur, date)
    summarize(date)
Example #2
def index():
    errors = ""
    if request.method == 'GET':
        return render_template("index.html", errors=errors)
    else:
        query = request.form['name']
        if query == "":
            return "No query"
        url = "http://en.wikipedia.org/wiki/" + query.lower()
        # Disabled: dictionary lookup for the query term.
        # word = Word(query)
        # word.synsets[:5]
        # defi = word.definitions[0]
        defi = ""
        summary = ""
        if defi:
            summary += defi

        text = summarize.summarize(url, query.lower())
        summary += text
        # Disabled: URL-encode the summary for further requests.
        # words = summary.split()
        # sent = ""
        # for w in words:
        #     sent += w + "%20"

        return render_template("index.html", summary=summary)
Example #3
def main(argv):
    control = make_control(argv)
    sys.stdout = Logger.Logger(base_name=control.arg.base_name)
    print control

    in_df = pd.read_csv(control.path_in,
                        nrows=1000 if control.test else None,
                        )
    summary_df = summarize.summarize(in_df)
    report_summary = make_report(summary_df)
    # TODO: print correlations of each variable with price

    print summary_df

    # write output files
    summary_df.to_csv(control.path_out_summary)

    f = open(control.path_out_report, 'wb')
    pickle.dump((report_summary, control), f)
    f.close()

    if control.test:
        print 'DISCARD OUTPUT: TESTING'

    print control
    print 'done'
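
The TODO above is left open; a minimal sketch for it, assuming in_df carries a numeric 'price' column (the column name is a guess):

# Hypothetical sketch for the TODO: correlation of each numeric column with price.
price_corr = in_df.corr()['price'].sort_values(ascending=False)
print price_corr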
Example #4
	def GET(self, key):
		# file name
		fname = cwd("static", "files", "cluster", key)
		res = {}
		import os, json
		if not os.path.isfile(fname):
			return json.dumps({"error": "file not found"})
		web.header('Content-Type', 'application/json')
		sentence = open(fname, 'r').read()
		tags = jieba.analyse.extract_tags(sentence, 10)
		words = jieba.cut(sentence)
		freq = {}
		total = 0.0
		# TODO: load stop words from a file
		stop_words = set([
		"where","the","of","is","and","to","in","that","we","for","an","are","by","be","as","on","with","can","if","from","which","you","it","this","then","at","have","all","not","one","has","or"
		])
		# count word frequencies
		for w in words:
		    if len(w.strip()) < 2: continue
		    if w.lower() in stop_words: continue
		    freq[w] = freq.get(w, 0.0) + 1.0
		    total += freq[w]
		tags = dict([(x, freq[x]) for x in tags])
		summary = summarize.summarize(sentence)
		# replace '\n' with <br> so it renders in HTML
		summary = summary.replace('\n', "<br>")
		# print summary
		return json.dumps({"keyword": tags, "summary": summary})
Example #5
 def test_that_it_runs(self):
     text = summarize(
         "Alice and Bob are friends. Alice is fun and cuddly."
         " Bob is cute and quirky. Together they go on wonderful"
         " adventures in the land of tomorrow. Alice's cuddlines"
         " and Bob's cuteness allow them to reach their goals."
         " But before they get to them, they have to go past their"
         " mortal enemy — Mr. Boredom. He is ugly and mean. They"
         " will surely defeat him. He is no match for their abilities.")
     self.assertTrue(bool(text))
Example #6
 def test_when_there_arent_any_words_in_common(self):
     text = (
         "Alice is awesome. I'm hot and you're not. This is pretty sick. "
         "We are all divisive. Nothing common between these sentences. "
         "And here's one more example of that happening."
     )
     summary = summarize(text)
     self.assertEqual(
         summary,
         "Alice is awesome. I'm hot and you're not. This is pretty sick. "
         "We are all divisive. Nothing common between these sentences."
     )
Example #7
def index():
    text = request.forms.getunicode('text')
    number = int_or_none(request.forms.get('number'))
    language = request.forms.get('language') or 'english'
    result = summarize(text, number, language) if number and text else None
    return {
        'text': text or DEMO_TEXT,
        'result': result,
        'number': number or 5,
        'language': language,
        'available_languages': LANGUAGES
    }
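
int_or_none is a helper this handler assumes but does not show; a minimal sketch consistent with how it is called (hypothetical, not necessarily the project's implementation):

def int_or_none(value):
    # Return the form value parsed as int, or None for missing/unparsable input.
    try:
        return int(value)
    except (TypeError, ValueError):
        return None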
Example #8
def main():
    # Create a parser object to handle passing arguments through the command line
    parser = argparse.ArgumentParser(description='TextNow Coffee Tasting')
    subparsers = parser.add_subparsers(dest='command', help='command')

    # Define 3 parser objects for the 3 different operations that are available
    commands = ['parse', 'summarize', 'recommend']
    parsers = {c: subparsers.add_parser(c) for c in commands}
    parsers['parse'].add_argument('arg', help='coffee descriptive name')
    parsers['summarize'].add_argument('arg', help='input csv file',
                                      type=argparse.FileType('r'))
    parsers['recommend'].add_argument('arg', help='input csv file',
                                      type=argparse.FileType('r'))
    args = parser.parse_args()
    
    if args.command == 'parse':
        coffee = Coffee.fromname(name=args.arg)
        coffee.display()

    elif args.command == 'summarize':
        summarize.summarize(args.arg)

    elif args.command == 'recommend':
        cf.recommend(args.arg)
Example #9
def summarize_official():
    """
    If there is only one document, gets the summary of that document.  Otherwise, asks the user which document do they want to 
    summarize. 

    Return: Summary or a question of which doc do they want to summarize
    """
    # print(len(TotalDocs), flush = True)
    if (len(TotalDocs) == 1):
        summary = summarize(db, TotalDocs[0], stopwords, summarizeLength = 2)
        # print(len(summary), flush = True)
        msg = "The summary is: {}".format(summary)
        return statement(msg)
    else:   
        return question("Which document? Give a number")
Example #10
def summarizeRightDoc(Number):
    """
    Parameter: The number corresponding to the document in which the user wants a summary

    Goes to the corresponding document and gets the summary for that document

    Return: Summary of specified document
    """
    Number = "{}".format(Number)
    # print(Number, flush = True)
    documentID = TotalDocs[int(Number) - 1]
    summary = summarize(db, documentID, stopwords)
    print("{}".format(summary), flush = True)
    msg = "The summary is: {}".format(summary)
    return statement(msg)
Example #11
File: app.py Project: waltherg/pubmed
def make_summary(pmid=None):
    r = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
                       'elink.fcgi?dbfrom=pubmed&id=%d&cmd=prlinks'
                       '&retmode=json' % pmid)
    
    body = None
    summary = None
    url = None
    if r.status_code == 200:
        xml = etree.fromstring(r.text)
        try:
            url = xml.xpath('//Url')[0].text
            full_text_r = requests.get(url)
            article = BS(full_text_r.text)
            paragraphs = article.findAll(['p'])
            body = ' '.join([p.text for p in paragraphs])
            summary = summarize(body, pmid)
        except Exception:
            summary = ['PubMed provided no full text URL for PMID %d' % pmid]
            url = None

    return summary, url
Example #12
def genTextMetrics(raw_text):
    summary = summaryEngine.summarize(raw_text)
    svo = textEngine.extract(summary)

    final_text_data = {
        "summary": summary,
        "svo_data": []
    }

    for scene in svo:
        # print scene
        sent_subject = scene["raw_subject"] if len(scene["simple_subject"]) == 0 else scene["simple_subject"]
        sent_object = scene["raw_object"] if len(scene["simple_object"]) == 0 else scene["simple_object"]
        sent_predicate = scene["predicate"]

        file_urls = {}

        file_urls["subject"] = getImageFromString(sent_subject)
        file_urls["verb"] = getImageFromString(sent_predicate)
        if len(sent_object) != 0:
            # print "OBJECT"
            file_urls["object"] = getImageFromString(sent_object)

        sent_data = {
            "subject": {
                "text": sent_subject,
                "image": file_urls["subject"]
            },
            "verb": {
                "text": sent_predicate,
                "image": file_urls["verb"]
            },
            "object": {
                "text": sent_object,
                "image": file_urls["object"] if len(sent_object) != 0 else None
            }
        }

        final_text_data["svo_data"].append(sent_data)

    return final_text_data
Example #13
from summarize import summarize

text = "Alice and Bob are friends. Alice is fun and cuddly. Bob is cute and quirky. Together they go on wonderful adventures in the land of tomorrow. Alice's cuddliness and Bob's cuteness allow them to reach their goals. But before they get to them, they have to go past their mortal enemy — Mr. Boredom. He is ugly and mean. They will surely defeat him. He is no match for their abilities."
sentence_count = 2
language = 'english'
summary = summarize(text, sentence_count, language=language)

print(summary)
Example #14
def process():
    name = request.form['name']
    if name:
        return jsonify({'name': summarize.summarize(name)})

    return jsonify({'error': 'Missing data!'})
Example #15
 def test_single_sentence(self):
     text = "Alice is awesome"
     summary = summarize(text)
     self.assertEqual(text, summary)
Example #16
 def test_doesnt_crash_on_empty_sentences(self):
     try:
         summarize('. . .')
     except Exception as e:
         self.fail(e)
Example #17
from history import History
import scrape
from summarize import summarize

if __name__ == '__main__':
    history = History()
    target = scrape.get_article(scrape.url)
    print('\n\n\n')
    print(summarize(target[3], 1.25))

# This application is a work in progress and not meant to be run aside from testing purposes.
Example #18
def get_summary(text):
    return summarize(text, sentence_count=5, language='spanish')
Example #19
import logging
import sys

import fetch
import generate
import summarize
import tag
import translate

__author__ = "imdreamrunner"
__email__ = "*****@*****.**"


logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)


if __name__ == "__main__":
    print("Welcome to Reader")
    args = sys.argv
    if len(args) != 2:
        print("Usage: python reader.py <fetch|generate>")
        exit(1)
    command = args[1]
    if command == "fetch":
        log.debug("Command: fetch")
        fetch.fetch_all()
        summarize.summarize()
        translate.translate()
        tag.tag()
    elif command == "generate":
        log.debug("Command: generate")
        generate.generate()
    else:
        print("Unknown command: " + command)
    log.info("Program exits.")
Example #20
n_estimators = 1000
max_features = 55
## max_features = int(X.shape[1])
## max_features='auto'


print 'Constructing random forest classifier from training set...'
sys.stdout.flush()
time0 = time.time()
rfor = ensemble.RandomForestClassifier(n_estimators=n_estimators,max_features=max_features,n_jobs=-1)
rfor = rfor.fit(X, Y)
dt = time.time() - time0
print '   that took %.1f seconds.\n' % dt


Y_pred = rfor.predict(X)
## print 'Training sample:'
## summarize(Y,Y_pred)


Ytest_pred = rfor.predict(Xtest)
print 'Test sample:'
summarize(Ytest,Ytest_pred)

## del X,Y,Y_pred
## del Xtest,Ytest,Ytest_pred

## del rfor

Example #21
from tfidf import TfidfModel

from summarize import summarize
from lib.db import load_docs_for_training, load_reviews_and_split_to_sentences

docs = load_docs_for_training()
tfidf = TfidfModel()
model, dictionary = tfidf.generate(docs)

target_id = 0

sentences_unfiltered = load_reviews_and_split_to_sentences(target_id)

summary_sentences = summarize(sentences_unfiltered,
                              model,
                              dictionary,
                              max_characters=70,
                              user_mmr=True,
                              sent_limit=50)
for sentence in summary_sentences:
    print(sentence.strip())
Example #22
def run(num=None):
    feeds, feedfileObject = load()
    mailserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] 
        else: ifeeds = feeds
        
        if num: ifeeds = [feeds[num]]
        feednum = 0
        
        for f in ifeeds:
            try: 
                feednum += 1
                if not f.active: continue
                
                if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue
                
                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue
                
                http_status = r.get('status', 200)
                if VERBOSE > 1: print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                  'content-type': 'application/rss+xml', 
                  'content-length':'1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]: 
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)

                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s"  % (feednum, f.url)

                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)

                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)

                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                        
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))

                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue
                
                r.entries.reverse()
                
                for entry in r.entries:
                    id = getID(entry)
                    
                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    
                    frameid = entry.get('id')
                    if not(frameid): frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]
                    
                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    
                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue

                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break
                    
                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()
                    
                    when = time.gmtime()

                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype+"_parsed"
                            if kind in entry and entry[kind]: when = entry[kind]
                        
                    link = entry.get('link', "")
                    
                    from_addr = getEmail(r, entry)
                    
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", when)
                    useragenthdr = "rss2email"
                    
                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)
                    
                    extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(), 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline, 'X-MUNGED-FROM': getMungedFrom(r), 'References': ''}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER 
                    
                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''
                    if THREAD_ON_TAGS and len(tagline):
                        extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest() for t in tagline.split(',')])
                    if USE_CSS_STYLING and HTML_MAIL:
                        contenttype = 'html'
                        content = "<html>\n" 
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest() for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(src,'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except:
                                    print >>warn, "Could not load image: %s" % src
                                    pass
                        if body != '':  
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
                        if hasattr(entry,'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:   
                        if ishtml(entrycontent):
                            contenttype = 'html'
                            content = "<html>\n" 
                            content = ("<html><body>\n\n" + 
                                       '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
                                       entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="'+link+'">'+link+'</a></p>' )
                                       
                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
                                                                
                            content += ("\n</body></html>")
                        else:
                            content = entrycontent.strip() + "\n\nURL: "+link
                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'

                    mailserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, when, extraheaders, mailserver, f.folder)
            
                    f.seen[frameid] = id
                    
                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue

    finally:        
        unlock(feeds, feedfileObject)
        if mailserver:
            if IMAP_MARK_AS_READ:
                for folder in IMAP_MARK_AS_READ:
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Seen)')
            if IMAP_MOVE_READ_TO:
                typ, data = mailserver.list(pattern='*')
                # Parse folder listing as a CSV dialect (automatically removes quotes)
                reader = csv.reader(StringIO.StringIO('\n'.join(data)),dialect='mailboxlist')
                # Iterate over each folder
                for row in reader:
                    folder = row[-1:][0]
                    if folder == IMAP_MOVE_READ_TO or '\Noselect' in row[0]:
                        continue
                    mailserver.select(folder)
                    yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%b-%Y")
                    res, data = mailserver.search(None, '(SEEN BEFORE %s UNFLAGGED)' % yesterday)
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                                if res == 'OK':
                                    res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Deleted)')
                                    mailserver.expunge()
            try:
                mailserver.quit()
            except:
                mailserver.logout()
Example #23
def summarize_text():
    to_summarize = str(haven.get_text(request.form['url']))
    return summarize.summarize(to_summarize)
Example #24
def get_concepts():
    url = request.form['url']
    data = json.dumps(haven.analysis(request.form['url'], False))
    summary = str(haven.get_text(request.form['url']))

    return render_template('learn.html', data=data, url=url, summary=summarize.summarize(summary))
Example #25
def test_summarize_custom():
    assert summarize.summarize([1, 1, 1, 1, 1]) == 1
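
summarize here reduces a list of numbers to a single value; taken together with the seed test in Example #29 below, which compares against np.mean, a minimal sketch that satisfies both tests would be:

import numpy as np

def summarize(numbers):
    # Assumed from the tests: the summary of a list of numbers is their mean.
    return np.mean(numbers)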
Example #26
def process_request():
    if request.method == 'POST':
        url = request.data.get("url_field")
        #print(url)
        key = request.data.get("keywords")
        key = key[1:-1].split(",")
        keywords=[]
        for k in key:
            keywords.append(k[1:-1])
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        categories = ['new act', 'new rule', 'new regulation', 'notification', 'circular', 'press release',
                      'scheme', 'order', 'ordinance', 'amendment', 'resolution', 'bill', 'report', 'guideline',
                      'direction', 'clarification', 'master direction','revised']
        cat_map = dict()

        for k in keywords:
            india_links = lambda tag: (getattr(tag, 'name', None) == 'a' and
                                       'href' in tag.attrs and
                                       k in tag.get_text().lower())
            results = soup.find_all(india_links)
            extracted = []
            for i in results:
                extracted.append([i.contents[0], i.get('href')])

            for z in extracted:
                flag = 0
                for cat in categories:

                    if cat in z[0].lower():

                        if cat in cat_map:

                            cat_map[cat].append(z)
                            flag = 1
                            break
                        else:
                            cat_map[cat] = [z]
                            flag = 1
                            break
                if flag == 0:
                    if 'others' in cat_map:
                        cat_map['others'].append(z)
                        flag = 1

                    else:
                        cat_map['others'] = [z]
        absUrl = 'http://www.sebi.gov.in/'
        for k, v in cat_map.items():
            for q in range (len(v)):
                url = v[q][1]
                
                r = requests.get(url)
                soup = BeautifulSoup(r.content, "html.parser")

                for i in soup.find_all('iframe'):
                    innerLinks = i.get('src')
                    pdfLink = absUrl + innerLinks[28:]
                    pdfLink = str(pdfLink)
                    print (pdfLink)
                # url = 'http://www.sebi.gov.in/web/?file=../../../sebi_data/attachdocs/nov-2017/1509707086156.pdf'
                    url = pdfLink
                # writer = PdfFileWriter()
                    pdf = pdfx.PDFx(url)
                # metadata = pdf.get_metadata()

                    references_dict = pdf.get_references_as_dict()
                    metadata = pdf.get_metadata()
                    text = pdf.get_text()
                    z = summarize(text, sentence_count=4, language='english')
                    v[q].append(z)
                    v[q].append(references_dict)
                    v[q].append(metadata)

        return cat_map, status.HTTP_200_OK
Example #27
def run_model(destination, subsample=None, min_Nflights=None):

    ### Unpickle datasets
    print 'Unpickling datasets...'
    time0 = time.time()
    filename = '/data/DelayMeNot/data/pickles_by_destination/datasets_%s.pkl' % destination
    (data_train, data_test) = cd.load_from_pickle(filename, gzip=True)
    print '   that took %.1f seconds.' % (time.time() - time0)


    ### subsample training data
    if subsample:
        if len(data_train) > 1e6:
            Nsub = int( float(len(data_train)) / 1e6 )
            data_train = data_train.ix[::Nsub]


    ### remove all routes with less than min_Nflights flights
    grouped = data_train.groupby('Origin')
    Nflights = grouped['Origin'].count()
    Nflights.sort(ascending=True)
    if min_Nflights:
        orig_list = list(Nflights[Nflights > min_Nflights].index)
        if len(orig_list) == 0:
            print 'Found no routes with more than %d flights!' % min_Nflights
            return 0
        data_train = data_train[data_train['Origin'].isin(orig_list)]
        data_test = data_test[data_test['Origin'].isin(orig_list)]
    else:
        orig_list = list(Nflights.index)

    ## return data_train, data_test
                
    ### "Dummify" the categorical 'Carrier' and 'Origin' columns,
    ### and add the dummies to the table, but drop the first dummy
    ### column to avoid "dummy variable trap".

    print 'Dummifying datasets...'
    time0 = time.time()

    dummies = pd.get_dummies(data_train['Carrier'],prefix='Carrier')
    data_train = data_train.join(dummies.ix[:,1:])
    dummies = pd.get_dummies(data_test['Carrier'],prefix='Carrier')
    data_test = data_test.join(dummies.ix[:,1:])

    dummies = pd.get_dummies(data_train['Origin'],prefix='Origin')
    data_train = data_train.join(dummies.ix[:,1:])
    dummies = pd.get_dummies(data_test['Origin'],prefix='Origin')
    data_test = data_test.join(dummies.ix[:,1:])

    ### Drop dummified columns
    data_train = data_train.drop(['Carrier','Origin'],axis=1)
    data_test = data_test.drop(['Carrier','Origin'],axis=1)
    
    print '   that took %.1f seconds.' % (time.time() - time0)
    
    ### Training set columns
    train_cols = list(data_train.columns)
    train_cols.remove('ArrivalDelay')
    
    ### Add any missing training columns to test dataset
    test_cols = list(data_test.columns)
    for tc in train_cols:
        if tc not in test_cols:
            data_test[tc] = np.zeros_like(data_test[test_cols[0]])
    
    ### Define training and test data set variables
    late_delay = 30.0

    X = data_train[train_cols].values.copy()
    Y = np.zeros_like(data_train['ArrivalDelay'].values)
    Y[data_train['ArrivalDelay'].values > late_delay] = 1

    Xtest = data_test[train_cols].values.copy()
    Ytest = np.zeros_like(data_test['ArrivalDelay'].values)
    Ytest[data_test['ArrivalDelay'].values > late_delay] = 1

    del data_train, data_test
    
    ### Train the RandomForest model
    ## n_estimators = 1000
    n_estimators = 128
    ## max_features = 'auto'
    max_features = int(X.shape[1]/2)
    
    print 'Constructing random forest classifier from training set...'
    print '   Number of flights in training data set = %d' % len(Y)
    sys.stdout.flush()
    time0 = time.time()
    rfor = ensemble.RandomForestClassifier(n_estimators=n_estimators,max_features=max_features,n_jobs=8)
    rfor = rfor.fit(X, Y)
    rfor.n_jobs = 1

    Y_pred = rfor.predict(X)
    train_summary = summarize(Y,Y_pred)

    dt_train = time.time() - time0
    print '   that took %.1f seconds.\n' % dt_train
    sys.stdout.flush()

    
    ### Test the model
    print 'Testing the model...'
    time0 = time.time()
    Ytest_pred = rfor.predict(Xtest)
    test_summary = summarize(Ytest,Ytest_pred)
    dt_test = (time.time() - time0)
    print '   that took %.1f seconds.' % dt_test
    sys.stdout.flush()

    
    ### Construct model summary dict
    model_summary = {}
    model_summary['training_columns'] = train_cols
    model_summary['training'] = train_summary
    model_summary['time_to_train'] = dt_train

    model_summary['test'] = test_summary
    model_summary['time_to_test'] = dt_test

    model_summary['late_delay'] = late_delay
    if subsample:
        model_summary['subsample'] = True
        model_summary['Nsub'] = Nsub
    else:
        model_summary['subsample'] = False
    model_summary['min_Nflights'] = min_Nflights
    model_summary['n_estimators'] = n_estimators
    model_summary['max_features'] = max_features

        
    ### Pickle the result
    print 'Pickling the result...'
    time0 = time.time()

    filename = '../RandomForest_models/by_destination/rfm_%s.pkl' % destination
    f = open(filename, 'wb')
    cPickle.dump((rfor,model_summary),f,2)
    f.close()
    subprocess.call('gzip %s' % filename, shell=True)
    print '   that took %.1f seconds.' % (time.time() - time0)
Example #28
def queryToDocument():
    """
    Uses the inputted query and returns all the documents relating to the query.  It then prompts the user if they want a summary
    of a document.  
    Return: All documents relating to query and a question if the user wants a summary
    """
    finalDocs = db.engine.query(Query, 3)
    finalDocs = [i[0] for i in finalDocs]
    print(finalDocs, flush = True)
    wikipediaString = "wikipedia"
    cnnString = "cnn"
    reuterString = "reteurs"
    global TotalDocs
    if len(finalDocs) == 1:
        if wikipediaString in finalDocs[0]:
            source = wikipediaString
        elif cnnString in finalDocs[0]:
            source = cnnString
        elif reuterString in finalDocs[0]:
            source = reuterString
        summary = summarize(db, finalDocs[0], stopwords)
        title = getTitle()
        image_msg = "The top document is {}".format(title) + " from {}".format(source) + "."
        image_msg += " Would you like a summary of this document?"
        TotalDocs = finalDocs
        return question(image_msg)
    elif len(finalDocs) == 2:
        if wikipediaString in finalDocs[0]:
            source1 = wikipediaString
        if cnnString in finalDocs[0]:
            source1 = cnnString 
        if reuterString in finalDocs[0]:
            source1 = reuterString
        if wikipediaString in finalDocs[1]: 
            source2 = wikipediaString 
        if cnnString in finalDocs[1]:
            source2 = cnnString
        if reuterString in finalDocs[1]:
            source2 = reuterString
        filler = summarize(db, finalDocs[0], stopwords)
        title1 = getTitle() 
        filler2 = summarize(db, finalDocs[1], stopwords)[0]
        title2 = getTitle()
        image_msg = "The top documents are {}".format(finalDocs[0]) + " from {}".format(source1) + " and " + "{}".format(finalDocs[1]) + " from {}".format(source2)
        image_msg += " Would you like a summary of a document?"
        TotalDocs = finalDocs
        return statement(image_msg)
    image_msg = "The top documents are "
    for i in range(len(finalDocs) - 1): 
        if wikipediaString in finalDocs[i]:
            source = wikipediaString
        elif cnnString in finalDocs[i]: 
            source = cnnString
        elif reuterString in finalDocs[i]:
            source = reuterString  
        #filler = summarize(db, finalDocs[i], stopwords, summarizeLength = 5)
        #title = getTitle()
        image_msg += "{}".format(finalDocs[i]) + " from {}".format(source)
        image_msg += ", "
    image_msg += "and "
    fillerrr = summarize(db, finalDocs[-1], stopwords)  
    titleLast = getTitle()
    image_msg += "{}".format(titleLast)
    image_msg +=  " Would you like a summary of a document?"
    TotalDocs = finalDocs
    return statement(image_msg)
Example #29
def test_summarize_seed():
    np.random.seed(5)
    numbers = summarize.gen_numbers(5)
    assert summarize.summarize(numbers) == np.mean([99, 78, 61, 16, 73])
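
gen_numbers is not shown; a plausible sketch, assuming it draws n integers from numpy's seeded global generator (the range is a guess, so this may not reproduce the exact values asserted above):

import numpy as np

def gen_numbers(n):
    # Hypothetical: draw n random integers in [0, 100]; the project's actual bounds are unknown.
    return np.random.randint(0, 101, n)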
Example #30
def createsummary(options, totalprocs, procid):

    procidstr = "%s of %s " % (procid, totalprocs) if totalprocs is not None else ""

    logging.info("Processor " + procidstr + "starting")

    referencetime = int(time.time()) - ( 7 * 24 * 3600 ) 

    config = account.getconfig(options['config'])
    dbconf = config['accountdatabase']

    outdb = output.factory(config['outputdatabase'])

    ratecalc = RateCalculator(procid)
    timewindows = dict()

    for resourcename, settings in config['resources'].iteritems():

        if 'enabled' in settings:
            if settings['enabled'] == False:
                continue

        if options['resource'] not in (None, resourcename, str(settings['resource_id'])):
            continue

        processtimes = { "mintime": 2**64, "maxtime": 0 }

        dbreader = account.DbAcct( settings['resource_id'], dbconf, PROCESS_VERSION, totalprocs, procid, options['localjobid'])

        bacct = batch_acct.factory(settings['batch_system'], settings['acct_path'], settings['host_name_ext'] )

        if settings['lariat_path'] != "":
            lariat = summarize.LariatManager(settings['lariat_path'])
        else:
            lariat = None

        dbwriter = account.DbLogger( dbconf["dbname"], dbconf["tablename"], dbconf["defaultsfile"] )

        for acct in dbreader.reader():
            logging.debug("%s local_job_id = %s", resourcename, acct['id'])
            job = job_stats.from_acct( acct, settings['tacc_stats_home'], settings['host_list_dir'], bacct )
            summary,timeseries = summarize.summarize(job, lariat)

            insertOk = outdb.insert(resourcename, summary, timeseries)

            if summary['complete'] == False and summary["acct"]['end_time'] > referencetime:
                # Do not mark incomplete jobs as done unless they are older than the
                # reference time (which defaults to 7 days ago)
                dbwriter.logprocessed(acct, settings['resource_id'], ERROR_INCOMPLETE)
                continue
            
            if insertOk:
                dbwriter.logprocessed( acct, settings['resource_id'], PROCESS_VERSION )
                processtimes['mintime'] = min( processtimes['mintime'], summary["acct"]['end_time'] )
                processtimes['maxtime'] = max( processtimes['maxtime'], summary["acct"]['end_time'] )
                ratecalc.increment()
            else:
                # Mark as negative process version to indicate that it has been processed
                # but no summary was output
                dbwriter.logprocessed( acct, settings['resource_id'], 0 - PROCESS_VERSION )

        if processtimes['maxtime'] != 0:
            timewindows[resourcename] = processtimes

    logging.info("Processor " + procidstr + "exiting. Processed %s", ratecalc.count)

    if ratecalc.count == 0:
        # No need to generate a report if no docs were processed
        return

    proc = { "host": socket.getfqdn(),
            "instance": procid,
            "totalinstances": totalprocs,
            "start_time": ratecalc.starttime,
            "end_time": time.time() ,
            "rate": ratecalc.rate,
            "records": ratecalc.count
            }

    report = { "proc": proc, "resources": timewindows }

    outdb.logreport(report)
Example #31
def main():

    # Import the self-created "volumes" module and the given "summarize" module.
    import volumes
    import summarize

    validInput = ["cube", "c", "pyramid", "p", "ellipsoid", "e", "quit",
                  "q"]  # All valid user inputs when asking for shape
    valid = False  # Used to keep while loops running until valid input
    isShape = True  # Used to see if user input is a shape or quit
    index = 0  # Tracks the index of user input within "validInput" list

    # Lists of all the shapes to keep track of calculated volumes
    cubeVolumes = []
    pyramidVolumes = []
    ellipsoidVolumes = []

    # Introduction
    print("~Welcome to the Volume Calculator.~")
    print("")
    print("")

    # While loop to ask for a valid test case number
    while not valid:

        testCase = input(
            "Enter the test case number: ")  # Ask for test case number

        # If the input is an integer, set the testCase variable to input and change 'valid' to True to exit while loop
        if testCase.isnumeric():
            testCase = int(testCase)
            valid = True
        else:
            print(
                "Sorry, the test case must be a number."
            )  # If the input is not an integer, print message and loop again

    valid = False  # Reset 'valid' for next while loop

    # While loop that keeps running until the user enters 'quit' or 'q'
    while isShape:

        # While loop that asks for user input, calculates volume (if necessary) and keeps running until quit input.
        while not valid:

            shape = str(input("Please enter a shape: "))  # Ask for input
            shape = shape.lower()  # Convert to lower case

            # If the input is within the list of valid inputs, track down the index within the list and set 'valid'
            # to true. This allows the program to exit the while loop.
            if shape in validInput:
                index = validInput.index(shape)
                valid = True
            else:
                print(
                    "Sorry, your input is invalid."
                )  # Print error message for invalid input, and loop again
                print("")

        # If the user input was "cube" or "c", perform necessary actions.
        if index in range(0, 2):

            sideLength = int(input("Enter the side length of the cube: ")
                             )  # Ask the user for side length
            currentVolume = volumes.cubeVolume(
                sideLength
            )  # Calculate the volume by sending sidelength to method
            # "cubeVolume" within 'volumes' module.
            cubeVolumes.append(
                currentVolume)  # Add the volume to the cubes list

        # If the user input was "pyramid" or "p", perform necessary actions.
        elif index in range(2, 4):

            baseLength = int(input("Enter the base length of the pyramid: ")
                             )  # Ask the user for base length
            height = int(input("Enter the height of the pyramid: ")
                         )  # Ask the user for height
            currentVolume = volumes.pyramidVolume(
                baseLength,
                height)  # Calculate the volume by sending both base and
            # height to method "pyramidVolume" within
            # 'volumes' module.
            pyramidVolumes.append(
                currentVolume)  # Add the volume to the pyramids list

        # If the user input was "ellipsoid" or "e", perform necessary actions.
        elif index in range(4, 6):

            # Ask for the three radii of the ellipsoid
            radius1 = int(input("Enter the first radius: "))
            radius2 = int(input("Enter the second radius: "))
            radius3 = int(input("Enter the third radius: "))
            currentVolume = volumes.ellipsoidVolume(
                radius1, radius2,
                radius3)  # Calculate the volume by sending the
            # 3 radii to method 'ellipsoidVolume'
            # within 'volumes' module.
            ellipsoidVolumes.append(
                currentVolume)  # Add the volume to the ellipsoids list.

        # If the user input was "quit" or "q", change 'isShape' to false to allow program to exit loop
        elif index in range(6, 8):

            isShape = False

        valid = False  # Reset 'valid' to false, allowing program to loop through asking user for shape input again
        # if necessary.

    # Sort the volumes within each shape volume list in ascending order.
    cubeVolumes.sort()
    pyramidVolumes.sort()
    ellipsoidVolumes.sort()

    # Notify that the session has finished.
    print("")
    print("")
    print("You have reached the end of your session.")

    # If the user has not performed any calculations, print appropriate message
    if len(cubeVolumes) == 0 and len(pyramidVolumes) == 0 and len(
            ellipsoidVolumes) == 0:
        print("You did not perform any volume calculations.")
    else:

        print("The volumes calculated for each shape are:")

        # If there are calculated cube volumes, print them.
        if len(cubeVolumes) != 0:
            print("Cube: ", cubeVolumes)
        else:
            print("Cube: No shapes entered"
                  )  # If there are no cube volumes, print appropriate message

        # If there are calculated pyramid volumes, print them.
        if len(pyramidVolumes) != 0:
            print("Pyramid: ", pyramidVolumes)
        else:
            print(
                "Pyramid: No shapes entered"
            )  # If there are no pyramid volumes, print appropriate message

        # If there are calculated ellipsoid volumes, print them.
        if len(ellipsoidVolumes) != 0:
            print("Ellipsoid: ", ellipsoidVolumes)
        else:
            print(
                "Ellipsoid: No shapes entered"
            )  # If there are no ellipsoid volumes, print appropriate message

    # Within the 'summarize' module, send all the lists of volumes and test case number to the "summarize" method.
    # This will print them to a text file with the appropriate test case number.
    summarize.summarize(cubeVolumes, pyramidVolumes, ellipsoidVolumes,
                        testCase)
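
The summarize module used here is supplied with the assignment and not shown; a minimal sketch of the assumed interface, writing the volume lists to a text file named after the test case (file name and format are guesses):

def summarize(cubeVolumes, pyramidVolumes, ellipsoidVolumes, testCase):
    # Hypothetical sketch: write each sorted volume list to test<N>.txt.
    with open("test{}.txt".format(testCase), "w") as f:
        f.write("Cube: {}\n".format(cubeVolumes))
        f.write("Pyramid: {}\n".format(pyramidVolumes))
        f.write("Ellipsoid: {}\n".format(ellipsoidVolumes))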