Example No. 1
    def post(self, request, format=None):
        url = request.data.get('url')
        source = request.data.get('source')
        s = SummarizeUrl(url)
        foo = []
        for x in s:
            foo.append(remove_non_ascii(x))
        summary = ' '.join(foo)
        hdr = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'
        }
        req = urllib2.Request(url, headers=hdr)
        page = urllib2.urlopen(req)
        p = parse(page)
        title = p.find(".//title").text
        data = {
            'title': title,
            'url': url,
            'summarize_url': summary,
            'source': source
        }
        serializer = SummarySerializer(data=data)
        if serializer.is_valid():
            # serializer.save()
            return Response(serializer.data, status=status.HTTP_201_CREATED)
        return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
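The snippet above omits its imports. A plausible header for it, assuming Django REST Framework, lxml's HTML parser, and the pyteaser package as the source of SummarizeUrl (the listing itself confirms none of these), might be:

# Assumed imports for Example No. 1; SummarySerializer and remove_non_ascii are project helpers not shown here.
import urllib2                                 # Python 2 HTTP client used to fetch the page
from lxml.html import parse                    # provides parse(page) and .find(".//title")
from pyteaser import SummarizeUrl              # assumed source of SummarizeUrl
from rest_framework import status
from rest_framework.response import Response
from rest_framework.views import APIView       # post() would live on an APIView subclass
from .serializers import SummarySerializer     # hypothetical module path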
Example No. 2
def get_summary(url):
    summaries = SummarizeUrl(url)

    summary = []
    for elem in summaries:
        summary.append(elem)
    print ' '.join(summary)
Example No. 3
def catchContent():
    number_bb = t.count(
        '(//div[contains(@data-vr-zone, "Top Stories")]//span[contains(@class, "story-headline")])'
    )

    df_bb = pd.DataFrame(index=range(0, number_bb - 2),
                         columns=['Sno', 'Title', 'URL', 'Summary', 'Img_URL'])

    for n in range(0, number_bb - 2):
        title = t.read(
            '//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]'
            .format(n))
        URL_b = t.read(
            '//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]//@href'
            .format(n))
        URL = "https://www.straitstimes.com/" + str(URL_b)
        Img_URL = t.read(
            '//div[contains(@data-vr-zone, "Top Stories {}")]//span[contains(@class, "story-headline")]/ancestor::div[contains(@class, "body")]/..//img/@src'
            .format(n))
        summaries = SummarizeUrl(URL)
        df_bb.iloc[n, 0] = n
        df_bb.iloc[n, 1] = title
        df_bb.iloc[n, 2] = URL
        df_bb.iloc[n, 3] = summaries
        df_bb.iloc[n, 4] = Img_URL

    return df_bb
Example No. 4
def fact_extract(user_input):

    # user_input = str(raw_input('Enter input: '))  # the input comes from the front end instead of the CLI
    url = wikipedia.page(user_input).url

    # s = Summarize(user_input, wikipedia.page(user_input).content) would be the fallback if the URL were not available;
    # here the content is extracted directly via the page URL for the keyword the user entered
    s = SummarizeUrl(url)

    summ = ' '.join(word for word in s)

    #summ = summ.replace("]","")

    #non reg ex way of dealing with expressions
    def removeNestedParentheses(s):
        ret = ''
        skip = 0
        for i in s:
            if i == '[':
                skip += 1
            elif i == ']' and skip > 0:
                skip -= 1
            elif skip == 0:
                ret += i
        return ret

    summ = removeNestedParentheses(summ)
    summ = summ.replace(". ", ".\n\n")

    return summ
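The commented-out Summarize(...) call above hints at a fallback for when a page cannot be summarized from its URL. A minimal sketch of that idea, assuming pyteaser's Summarize(title, text) alongside SummarizeUrl and the wikipedia package (both assumptions, since the listing shows no imports):

import wikipedia
from pyteaser import Summarize, SummarizeUrl  # assumed library providing both helpers

def summarize_topic(user_input):
    # Prefer summarizing the article URL; fall back to the raw page text.
    page = wikipedia.page(user_input)
    sentences = SummarizeUrl(page.url)
    if not sentences:  # SummarizeUrl returns None when the page cannot be fetched or parsed
        sentences = Summarize(page.title, page.content)
    return ' '.join(sentences or [])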
Example No. 5
def get_data(rss, num):
    pathToCSV = '../fileStore/file.csv'
    data = []
    with open(pathToCSV, 'w') as csvfile:
        wr = csv.writer(csvfile, delimiter='@', quotechar='#')
        index = 0
        for e in rss['entries']:
            if (index == int(num)):
                break

            wr.writerow([(e['title']).encode('utf-8')])
            wr.writerow([(e['link']).encode('utf-8')])

            summary = []
            try:
                for elem in SummarizeUrl(e['link'].encode('utf-8')):
                    summary.append(elem)
                wr.writerow([
                    ' '.join(summary).encode('utf-8').strip().replace(
                        '\n', '')
                ])
            except TypeError:
                wr.writerow(['Summary Unavailable'])

            index = index + 1
Example No. 6
def main(url):
    # open db connection
    conn = sqlite3.connect('../db/summaries.db')
    c = conn.cursor()
    # check if exists

    u = (url, )
    rows = c.execute('SELECT COUNT(*) FROM summaries WHERE url=?',
                     u).fetchone()
    if (rows[0] != 0):
        #print url + " was found in db"
        conn.close()
        return

    # add the new article
    summaries = SummarizeUrl(url)
    if summaries is None:
        conn.close()
        return
    sums = " ".join(summaries).replace('\n', '. ')

    stage = (url, sums, date.today())
    c.execute('INSERT INTO summaries VALUES (?,?,?)', stage)

    # save & close
    conn.commit()
    conn.close()

    # delay so we don't overwhelm the site
    print url + " added"
    time.sleep(5)
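The INSERT above expects a summaries table with three columns (URL, summary text, date added). The original schema is not shown; a one-time setup consistent with that statement, using column names that are only guesses, might be:

import sqlite3

conn = sqlite3.connect('../db/summaries.db')
# Hypothetical schema: only the column count is implied by the 3-value INSERT in main().
conn.execute('CREATE TABLE IF NOT EXISTS summaries (url TEXT, summary TEXT, added DATE)')
conn.commit()
conn.close()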
Example No. 7
    def testURLs(self):
        urls = (
            u'http://www.huffingtonpost.com/2013/11/22/twitter-forward-secrecy_n_4326599.html',
            u'http://www.bbc.co.uk/news/world-europe-30035666',
            u'http://www.bbc.co.uk/news/magazine-29631332')

        #just make sure it doesn't crash
        for url in urls:
            summaries = SummarizeUrl(url)
Example No. 8
def fact_extract(user_input):
    url = wikipedia.page(user_input).url
    error = ''
    # s = Summarize(user_input, wikipedia.page(user_input).content) would be the fallback if the URL were not available;
    # here the content is extracted directly via the page URL for the keyword the user entered
    s = SummarizeUrl(url)
    try:
        summ = ' '.join(word for word in s)
    except TypeError:
        return error
    summ = removeNestedParentheses(summ)
    summ = summ.replace(". ", ".\n\n")
    return summ
Example No. 9
def summary(request):
    title = 'Summary'
    form = SummaryForm(request.POST or None)
    response_data = {}
    context = {
        "title": title,
        "form": form,
    }
    if request.method == 'POST':
        if form.is_valid():
            # only touch the model instance once the form has validated;
            # save(commit=False) on an invalid form raises ValueError
            instance = form.save(commit=False)
            url = instance.url
            summary = SummarizeUrl(url)
            instance.summarize_url = summary
            hdr = {
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Accept-Encoding': 'none',
                'Accept-Language': 'en-US,en;q=0.8',
                'Connection': 'keep-alive'
            }
            req = urllib2.Request(url, headers=hdr)
            page = urllib2.urlopen(req)
            p = parse(page)
            title = p.find(".//title").text
            instance.title = title
            # instance.save()
            response_data['url'] = instance.url
            response_data['title'] = title
            response_data['summary'] = instance.summarize_url
            context = {
                "title": title,
                "url": url,
                "summary": summary,
            }
            return JsonResponse(response_data)
        else:
            return JsonResponse({"nothing to see": "this isn't happening"})

    else:
        form = SummaryForm()

    return render(request, "website/index.html", context)
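The view depends on SummaryForm and its model, neither of which appears in the listing. A hypothetical pair consistent with the fields the view touches (url, title, summarize_url) could look like:

from django import forms
from django.db import models

class Summary(models.Model):
    # Field names mirror the attributes used in the view; the real project model may differ.
    url = models.URLField()
    title = models.CharField(max_length=255, blank=True)
    summarize_url = models.TextField(blank=True)

class SummaryForm(forms.ModelForm):
    class Meta:
        model = Summary
        fields = ['url']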
Example No. 10
def process(links):
    for s in range(1, 6):
        spath = "story" + str(s) + ".txt"
        delete_file_contents(spath)
    s = 1
    i = 0

    for link in links:
        try:
            i += 1
            spath = "story" + str(s) + ".txt"
            summaries = SummarizeUrl(link)
            append_file(spath, summaries, 0)
            if i == 3:
                i = 0
                s += 1
                retrimmer(spath)
        except:
            pass
Example No. 11
def run():
	conn = util.create_connection("./db/news.db")
	site = util.getSiteByName(conn, "BBC News")
	site_url = site[0][2]
	site_id = site[0][0]

	results, url, number_BBC, Img_link = NewsFromBBC(site_url)
	df = pd.DataFrame(index=range(0,number_BBC-1), columns = ['Sno', 'Title', 'URL','Summary', 'Img_URL'])
	
	for i in range(1, number_BBC):
		summaries = SummarizeUrl(url[i])

		df.iloc[i-1, 0] = i
		df.iloc[i-1, 1] = results[i]
		df.iloc[i-1, 2] = url[i]
		df.iloc[i-1, 3] = summaries
		df.iloc[i-1, 4] = Img_link[i]

	df = util.fixImgLink(df, "https://cf-templates-fghyux9ggb7t-ap-southeast-1.s3-ap-southeast-1.amazonaws.com/bbc.png")
	df = util.fixSummary(df)

	util.updateNews(conn, site_id, df)
Example No. 12
def getRSSFeedAttributes(feed, dangerFactors, citiesUSA, c, returnList, Errors, feedErrors, linkErrors):
	a = feedparser.parse(feed)
	
	for posts in a.entries:
		
		now = datetime.datetime.now()
		Date_Of_Access = str(now)

		# if 'published' in posts:
		try:
			date_published = posts['published']
			dtobject = parser.parse(date_published)
			date_published_reformated = dtobject.strftime('%Y-%m-%d %H:%M:%S')
		except Exception, e:
			date_published_reformated = None


		title = posts.title
		title = title.replace("'","''")
		if title:
			link = posts.link
			if link:
				factorList = []
				domain = []
				locationList = []
				wordFreqFactor = []
				wordFreqLocation = []
				summary = None  # stays None if SummarizeUrl raises below
				try:
					summary = SummarizeUrl(link)
				except Exception, e:
					SummaryError = str(e)
					Errors.append(SummaryError)
					feedErrors.append(feed)
					linkErrors.append(link)
				if summary:
					summary = u' '.join(summary)
					summary = summary.replace("'","''").replace("\n"," ")
					summaryLower = summary.lower()
					getFactorsFromText(summaryLower, summary, factorList, domain, locationList, wordFreqFactor, wordFreqLocation, dangerFactors, citiesUSA, c, title, link, date_published_reformated, Date_Of_Access, returnList, feed)
Example No. 13
def catchContent():
	number = t.count('(//li[contains(@class, "css-1iski2w")]/a)')
	df = pd.DataFrame(index=range(0,number), columns = ['Sno', 'Title', 'URL', 'Summary','Img_URL'])

	for n in range(1, number+1):
		title=t.read('//li[contains(@class, "css-1iski2w")][{}]/a/div'.format(n))
		URL=t.read('//li[contains(@class, "css-1iski2w")][{}]//@href'.format(n))
		Img_link=t.read('//li[contains(@class, "css-1iski2w")][{}]//img/@src'.format(n))
		summaries = SummarizeUrl(URL)

		df.iloc[n-1, 0] = n
		df.iloc[n-1, 1] = title.decode('utf-8')
		df.iloc[n-1, 2] = URL
		df.iloc[n-1, 3] = summaries
		df.iloc[n-1, 4] = Img_link

	df['Summary'].replace('None', np.nan, inplace=True)
	df.dropna(subset=['Summary'], inplace=True, how='any')
	df= df.reset_index(drop=True)
	df['Sno'] = df.index

	return df
Example No. 14
            url_list.append('https://news.google.com/{}'.format(raw[i]))
        except IndexError:
            # Comment plez
            url_list.append('https://news.google.com/{}'.format(raw[i]))
        i += 1

    for q in range(12):
        url_list.pop(0)

    return tuple(url_list)


# def __init__(self, q):
# #     self.query = q

# Comment this stuff out
if __name__ == "__main__":
    # Use this Unicode format though or it will break Google
    query = "massachusetts"

    lis = scraper(query)

    urls = (
        u'https://www.cnet.com/news/from-friday-to-pizza-rat-top-viral-videos-of-2010s/',
        u'http://www.bbc.co.uk/news/world-europe-30035666',
        u'http://www.bbc.co.uk/news/magazine-29631332')

    for url in urls:
        summaries = SummarizeUrl(url)
        pprint(summaries)
Example No. 15
# Add random articles
Urls = {
    'Huffington':
    'http://www.huffingtonpost.com/2013/11/22/twitter-forward-secrecy_n_4326599.html'
}
Urls.update({'BBC': 'http://www.bbc.co.uk/news/business-32817114'})
Urls.update({'BBC Tech': 'http://www.bbc.co.uk/news/technology-32814309'})
Urls.update({'BBC News': 'http://www.bbc.co.uk/news/uk-england-32822298'})

print('Summarising Websites\n\n')
# Loop through each site in the Dictionary
for Site in Urls:
    # Print the name of the Site in Use
    print('Website: %s' % Site)
    # Use the SummarizeURL function
    summaries = SummarizeUrl(Urls[Site])
    # Print Each Sentence on a new line
    for sentence in summaries:
        print sentence
    print('\n')

################# Output #################

# Importing Modules
#
# Adding Websites
#
# Summarising Websites
#
# Website: BBC
# Separately, the Federal Reserve fined a sixth bank, Bank of America, $205m over foreign exchange-rigging.
Example No. 16
def getNews(rssDict, service, searchedImages):
    try:
        startTime = time.time()
        directory = "./data/" + service + '/'
        # create directory for the service if it doesn't yet exist
        if not os.path.exists(directory):
            os.makedirs(directory)
            print('created directory: %s' % directory)
        # iterate each feed in service -> fetch data -> write to temp file -> copy to actual file
        for key, value in rssDict.items():
            fileName = directory + key + "-write.json"
            # delete {category}-write.json file if one already exists
            if os.path.exists(fileName):
                os.remove(fileName)
                print('deleted existing file: %s' % fileName)
            feed = feedparser.parse(value)  #parse feed to get all the posts
            feedDict = {}
            feedCounter = 0
            # loop through posts in category
            for post in feed.entries[:20]:  #limit to 20 entries per feed
                imgUrl = "none"
                # caching enabled: this prevents asking Google for images every time
                if post.link in searchedImages:
                    imgUrl = searchedImages[post.link]
                    print('found image in cache for %s. done!' % post.link)
                else:
                    query = post.title.split()
                    query = '+'.join(query)
                    if (service == 'reuters'):
                        imgSearch = (
                            "https://ajax.googleapis.com/ajax/services/search/images?v=1.0&q="
                            + query)
                    else:
                        imgSearch = (
                            "https://ajax.googleapis.com/ajax/services/search/images?v=1.0&q="
                            + service + "+" + query)

                    imgSearchRequest = requests.get(imgSearch, verify=False)

                    if (imgSearchRequest.status_code == 200):  #on get success
                        imgSearchData = imgSearchRequest.json()
                        try:
                            getNextImg = 1
                            imgUrl = imgSearchData['responseData']['results'][
                                0]['url']
                            if (service == 'reuters'):
                                imgUrl = urllib2.unquote(imgUrl)

                            badBbcUrl = 'http://ichef.bbci.co.uk/news/660/media/images/80201000/png/_80201000_breaking_image_large-3.png'
                            if (service == 'bbc' and imgUrl == badBbcUrl):
                                imgUrl = imgSearchData['responseData'][
                                    'results'][1]['url']
                                getNextImg = 2

                            # check if select url is actually an image
                            # if not, choose the next url
                            if 'image' not in requests.get(
                                    imgUrl, verify=False).headers['content-type']:
                                print("MISSED FIRST IMG URL = BAD CONTENT. SECOND FETCH!")
                                imgUrl = imgSearchData['responseData']['results'][getNextImg]['url']
                            # add to image cache if img found
                            searchedImages[post.link] = imgUrl
                            print(
                                'image not in cache but new one fetched for %s. done!'
                                % post.link)
                        except (TypeError, IndexError,
                                requests.exceptions.MissingSchema):
                            print('DENIAL FROM GOOGLE for %s. failed!' %
                                  post.link)
                            imgUrl = "200F"
                    else:
                        imgUrl = "404"
                        print(
                            'image not in cache. also couldnt fetch new one for %s. failed!'
                            % post.link)

                summary = SummarizeUrl(
                    post.link)  # summarize text from article
                feedDict[feedCounter] = [
                    post.title, post.link, summary, imgUrl
                ]
                feedCounter += 1
            # write the collected data to {category}-write.json in json format
            with open(fileName, 'w') as fp:
                json.dump(feedDict, fp)
            print('wrote file: %s' % fileName)
        # iterate through all categories and copy temp files to the actual files
        for key, value in rssDict.items():
            source = directory + key + "-write.json"
            destination = directory + key + ".json"
            if os.path.exists(source):
                copyfile(source, destination)
                print('copied file: %s' % destination)
            else:
                print('cannot copy file: source %s not found' % source)
        print("--- %s seconds ---\n" %
              (time.time() - startTime))  #iteration runtime
    except Exception as e:
        # swallow errors so one bad feed doesn't abort the run, but report what happened
        print("EXCEPTION ERROR EXCEPTION ERROR!! %s" % e)
Example No. 17
    len(monarchs) - len(featured) - 1) + ' features queued)'
to = 'test-email'

if production == True:
    to = 'Stevie Wonder Says <*****@*****.**>'
    subject = s

messageText = random.choice([
    "Today's subject is ", "This week's subject is ",
    "Today let's have a look at ", "Today's topic is ", "Today let's discuss "
])

messageText += s + '.<br /><br />Did you know...?<br /><br />'

imageUrls = im.getNImages(s, 5)
sentences = SummarizeUrl(wikipediaUrl)
if sentences is None:
    sentences = ['Little is known about ' + s + '.'] * 5

for i in xrange(5):
    messageText += sentences[i].encode('utf-8') + '<br /><br />'
    messageText += ('<center><img style="max-height: 480px; max-width: 480px;" '
                    'alt="one of the best pictures of ' + s + '" src="' +
                    imageUrls[i].encode('utf-8') + '" /></center><br /><br />')

messageText += "That's all for now. Have a great week!<br /><br />Long live the queen,<br />Steve"

# feel free to write back with feedback or suggestions on how to improve my newsletter

message = sendEmail.CreateHTMLMessage(sender, to, subject, messageText)
sendEmail.SendMessage(quickstart.gmail_service, 'me', message)
Example No. 18
# End of get_json_files()

if __name__ == "__main__":
    args = parse_arguments()
    json_files = get_json_files(args.dataset)
    total_summaries = []

    # Summarizing from URL
    print("Summarizing from URLs")
    for current_file in [json_files[0]]:
        with open(current_file, 'r') as json_file:
            for line in json_file:
                record = json.loads(line)
                url = record['URL_s']
                print("Summarizing...", url)
                summary_sentences = SummarizeUrl(url)
                if summary_sentences:
                    total_summaries.append(" ".join(summary_sentences))
        print("Done processing one file")

    print("Finished first pass through all records")
    print("Recombining and summarizing...")
    while len(total_summaries) > 15:
        summaries_to_join = int(len(total_summaries) / 15)
        if summaries_to_join == 1:
            break
        if summaries_to_join > 20:
            summaries_to_join = 20
        combined_summaries = [
            " ".join(total_summaries[i:i + summaries_to_join])
            for i in range(0, len(total_summaries), summaries_to_join)
Example No. 19
def longSummaryFromQuery(query):
    g = d.get_zci(query, True)
    if "(" in g:
        return Summarize(query, g, 15)
    else:
        return SummarizeUrl(g, 15)
Example No. 20
#rss = 'https://www.incibe-cert.es/bfeed/avisos-sci/all'
b = float('0.0')
lenlista = len(listaulr)
for rss in listaulr :
    print((b/lenlista)*100)
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context
    d = feedparser.parse(rss.replace('"','')) #<<WORKS!!
    if (len(d.entries) > 10) :
        ent = 10
    else:
        ent = len(d.entries)
    coda = Coda('c5d228f1-a38e-4956-b6e6-f6ceaac089d0')
    for i in range(0,int(ent)) :
        try:
            summaries = SummarizeUrl(d.entries[i].link)
            payload = {
                  'rows': [
                    {
                      'cells': [
                        {'column': 'c-_zKlSZgEHp', 'value': d.entries[i].title}, {'column': 'c-FaaJIgjDAb', 'value': d.entries[i].link}, {'column': 'c-MM-bPQUWUb', 'value': summaries},
                      ],
                    },
                  ],
                  "keyColumns" : [
                    "c-_zKlSZgEHp"
                  ]
            }
            print(payload)
            coda.upsert_row('G6ody-h41g','grid-30lv9fOPY1',payload)
        except:
Example No. 21
def wholeArticleFromQuery(query):
    g = d.get_zci(query, True)
    if "(" in g:
        return Summarize(query, g, 1500)
    else:
        return SummarizeUrl(g, 1500)
Example No. 22
def tease_url(url):
    summaries = SummarizeUrl(url)
    print(summaries)
Example No. 23
def summarize():
    url = urllib2.unquote(request.args.get('u') or '')
    summaries = SummarizeUrl(url)
    return jsonify(result=summaries)