def stylecloud(request: StyleCloudRequest):
    """Generate a stylecloud PNG from raw text or a scraped URL.

    Pops the request fields that need special handling (url/text source,
    background color, gradient) and forwards everything else straight to
    ``sc.gen_stylecloud``. Returns the rendered image as a FileResponse.

    Raises:
        ValueError: if neither ``text`` nor ``url`` is provided.
    """
    params = request.dict()
    url = params.pop("url", None)
    text = params.pop("text", None)
    background_color = params.pop("background_color", None)
    gradient = params.pop("gradient", None)

    # Gradient.none is the API sentinel for "no gradient"; stylecloud wants None.
    if gradient == Gradient.none:
        gradient = None

    if url is not None:
        require_diffbot()
        # TODO: replace this with a more lightweight alternative than diffbot
        result = article(
            url,
            token=DIFFBOT_TOKEN,
            paging=False,
            discussion=False,
            maxTags=0,
            norender=True,
        )["objects"][0]
        pprint.pprint(result)
        text = result["text"]
    elif text is None:
        # ValueError (a subclass of Exception) is more precise for bad input.
        raise ValueError('Must provide either "text" or "url".')

    sc.gen_stylecloud(
        **params,
        text=text,
        gradient=gradient,
        icon_dir="/tmp/icons",
        output_name=OUTPUT_NAME,
        # Guard: .as_hex() would raise AttributeError if the color is absent
        # (presumably a pydantic Color instance otherwise — TODO confirm).
        background_color=background_color.as_hex() if background_color is not None else None,
    )
    return FileResponse(OUTPUT_NAME, media_type="image/png", headers=headers)
def gen_keywords(url, dict, bank):
    """Extract key phrases from the article at *url* via Azure Text Analytics.

    Scrapes the article text with diffbot, cleans and chunks it, then posts
    each chunk batch to the Azure key-phrase endpoint. Every returned phrase
    is recorded with ``bankKeys(bank, ...)`` and its individual words are
    tallied with ``wordCount(dict, ...)``.

    Note: the parameter name ``dict`` shadows the builtin; it is kept for
    backward compatibility with keyword-argument callers.

    Returns:
        The cleaned article text.
    """
    # SECURITY: credentials are hard-coded here and in the diffbot call below;
    # they should be moved to environment variables / config.
    subscription_key = "cdc7974745364f92b1f5e0b9fcd41cef"
    endpoint = "https://jimwu.cognitiveservices.azure.com/"
    keyphrase_url = endpoint + "/text/analytics/v3.0/keyphrases"
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}

    # Fetch and pre-process the article text.
    json_result = diffbot.article(url, token='d656578220cbf622d16575aba331d47d')
    text = json_result['objects'][0]['text']
    cleaned_text = clean(text)
    documents = chunk(cleaned_text)

    for batch in documents:
        # An empty batch presumably signals the end of usable chunks — the
        # original stopped here (break, not continue), preserved as-is.
        if not batch['documents']:
            break
        # timeout added: a post with no timeout can hang the caller forever.
        response = requests.post(keyphrase_url, headers=headers, json=batch, timeout=30)
        for doc in response.json()['documents']:
            for key in doc['keyPhrases']:
                bankKeys(bank, key)
                wordCount(dict, key.split(' '))
    return cleaned_text
def get_keywords(url, dict, bank):
    """Extract key phrases from the article at *url* via Azure Text Analytics.

    Same pipeline as ``gen_keywords`` but without returning the cleaned text:
    scrape with diffbot, clean + chunk, post each batch to the Azure
    key-phrase endpoint, then record phrases via ``bankKeys(bank, ...)`` and
    word counts via ``wordCount(dict, ...)``.

    Note: the parameter name ``dict`` shadows the builtin; it is kept for
    backward compatibility with keyword-argument callers.
    """
    # SECURITY: credentials are hard-coded here and in the diffbot call below;
    # they should be moved to environment variables / config.
    subscription_key = "cdc7974745364f92b1f5e0b9fcd41cef"
    endpoint = "https://jimwu.cognitiveservices.azure.com/"
    keyphrase_url = endpoint + "/text/analytics/v3.0/keyphrases"
    # Hoisted out of the loop: the header dict was rebuilt every iteration.
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}

    json_result = diffbot.article(url, token='d656578220cbf622d16575aba331d47d')
    text = json_result['objects'][0]['text']
    documents = chunk(clean(text))

    for batch in documents:
        # An empty batch presumably signals the end of usable chunks — the
        # original stopped here (break, not continue), preserved as-is.
        if not batch['documents']:
            break
        response = requests.post(keyphrase_url, headers=headers, json=batch)
        for doc in response.json()['documents']:
            for key in doc['keyPhrases']:
                bankKeys(bank, key)
                wordCount(dict, key.split(' '))
import operator
from rake_nltk import Rake
from rake_nltk import Metric
import diffbot

# Load the custom stopword list (one word per line); use a context manager
# so the file handle is closed even if read() raises.
with open('finalList.txt', 'r', encoding='utf-8') as f:
    stop = f.read().split('\n')

# Rank phrases by raw word frequency rather than the default degree metric.
r = Rake(ranking_metric=Metric.WORD_FREQUENCY)

url = 'https://www.cnn.com/2020/08/21/politics/peter-rafael-dzibinski-debbins-green-beret-russia/index.html'
urlNoNames = 'https://www.britannica.com/science/influenza'

# SECURITY: hard-coded diffbot token; move to environment/config.
json_result = diffbot.article(urlNoNames, token='d656578220cbf622d16575aba331d47d')
words = json_result['objects'][0]['text']

r.extract_keywords_from_text(words)
result = r.get_ranked_phrases_with_scores()
print(result)
# Process with diffbot # -------------------- #with open('urls-test.txt','r') as urls: with open('urls.txt','r') as urls: for url in urls: try: print "---> Processing URL with diffbot: ", url parsedurl=urlparse(url) domain=parsedurl.netloc newpath=parsedurl.path.replace('/','_').strip() #filename="corpus-test-diffbot/"+domain+newpath+".txt" filename="corpus-diffbot/"+domain+newpath+".txt" file = open(filename, "w") #json_result = dumps(diffbot.article(url, token='f3472e9233ba4070833dfffb0fb97660')) json_result = dumps(diffbot.article(url, token='bcc855fd71b859791b2202d8297da1e3')) # new LAS token 11/2015 file.write(json_result) outfile.close() except: print "**** ERROR processing URL with diffbot: ", url, sys.exc_info()[0] time.sleep(1) ''' # -------------------- # Process with jusText # -------------------- #with open('urls-test.txt','r') as urls: with open('urls.txt','r') as urls: for url in urls: try: print "---> Processing URL with jusText: ", url page = urllib2.urlopen(url).read()
import diffbot

# Quick smoke test: scrape a single article with diffbot and dump its payload.
# SECURITY: hard-coded token; move to environment/config.
ARTICLE_URL = 'http://www.topsprogram.ca/reading-like-writers/'
DIFFBOT_TOKEN = 'd656578220cbf622d16575aba331d47d'

json_result = diffbot.article(ARTICLE_URL, token=DIFFBOT_TOKEN)

objects = json_result['objects']
print(objects)
print(objects[0]['text'])