Example #1
File: main.py Project: mausconi/saasify
def stylecloud(request: StyleCloudRequest):
    params = request.dict()
    url = params.pop("url", None)
    text = params.pop("text", None)
    background_color = params.pop("background_color", None)
    gradient = params.pop("gradient", None)

    if gradient == Gradient.none:
        gradient = None

    if url is not None:
        require_diffbot()
        # TODO: replace this with a more lightweight alternative than diffbot
        result = article(
            url,
            token=DIFFBOT_TOKEN,
            paging=False,
            discussion=False,
            maxTags=0,
            norender=True,
        )["objects"][0]
        pprint.pprint(result)
        text = result["text"]
    elif text is None:
        raise Exception('Must provide either "text" or "url".')

    sc.gen_stylecloud(**params,
                      text=text,
                      gradient=gradient,
                      icon_dir="/tmp/icons",
                      output_name=OUTPUT_NAME,
                      background_color=background_color.as_hex())

    return FileResponse(OUTPUT_NAME, media_type="image/png", headers=headers)
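The StyleCloudRequest model is not shown in this excerpt. Below is a minimal sketch of what the handler appears to assume, inferred from the params.pop calls and from background_color.as_hex(), which points at pydantic's Color type; the field defaults and the concrete Gradient members are guesses, not the project's definitions.

from enum import Enum
from typing import Optional

from pydantic import BaseModel
from pydantic.color import Color

class Gradient(str, Enum):
    # "none" matches the sentinel checked in the handler above
    none = "none"
    horizontal = "horizontal"
    vertical = "vertical"

class StyleCloudRequest(BaseModel):
    url: Optional[str] = None
    text: Optional[str] = None
    background_color: Color = Color("white")
    gradient: Gradient = Gradient.none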
Example #2
import requests
import diffbot

def gen_keywords(url, counts, bank):

    # Azure Text Analytics credentials and the key-phrase endpoint
    subscription_key = "cdc7974745364f92b1f5e0b9fcd41cef"
    endpoint = "https://jimwu.cognitiveservices.azure.com/"
    keyphrase_url = endpoint + "text/analytics/v3.0/keyphrases"
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}

    # Fetch the article text from the URL via Diffbot, then clean and batch it
    json_result = diffbot.article(url, token='d656578220cbf622d16575aba331d47d')
    text = json_result['objects'][0]['text']
    cleaned_text = clean(text)
    documents = chunk(cleaned_text)

    key_phrases = []
    # Send each batch of documents to the key-phrase endpoint
    for batch in documents:

        # stop once an empty batch is reached
        if not batch['documents']:
            break

        # make request to Azure API
        response = requests.post(keyphrase_url, headers=headers, json=batch)
        # collect key phrases from the response
        doc_list = response.json()['documents']
        for doc in doc_list:
            key_phrases += doc['keyPhrases']
            for key in doc['keyPhrases']:
                bankKeys(bank, key)     # tally the full phrase
                arr = key.split(' ')
                wordCount(counts, arr)  # tally each individual word

    # note: key_phrases is accumulated above but only the cleaned text is returned
    return cleaned_text
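clean() and chunk() are project helpers that are not shown. The v3.0 keyphrases endpoint expects request bodies shaped like {"documents": [{"id": ..., "language": ..., "text": ...}]}, so chunk() presumably yields batches in that form. A rough sketch under that assumption (the 5,120-character per-document cap is Azure's documented limit for this API; the batch size of 10 is an assumption):

def chunk(text, max_chars=5120, batch_size=10):
    # Hypothetical stand-in for the project's chunk() helper: split the
    # text into pieces under the per-document cap, then group the pieces
    # into request bodies of at most batch_size documents each.
    pieces = [text[i:i + max_chars] for i in range(0, len(text), max_chars)]
    batches = []
    for start in range(0, len(pieces), batch_size):
        batches.append({
            "documents": [
                {"id": str(start + j + 1), "language": "en", "text": piece}
                for j, piece in enumerate(pieces[start:start + batch_size])
            ]
        })
    return batches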
Example #3
import requests
import diffbot

def get_keywords(url, counts, bank):
    # Azure Text Analytics credentials and the key-phrase endpoint
    subscription_key = "cdc7974745364f92b1f5e0b9fcd41cef"
    endpoint = "https://jimwu.cognitiveservices.azure.com/"
    keyphrase_url = endpoint + "text/analytics/v3.0/keyphrases"
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}

    # Fetch the article text from the URL via Diffbot, then clean and batch it
    json_result = diffbot.article(url, token='d656578220cbf622d16575aba331d47d')
    text = json_result['objects'][0]['text']
    documents = chunk(clean(text))

    key_phrases = []
    # Send each batch of documents to the key-phrase endpoint
    for batch in documents:
        # stop once an empty batch is reached
        if not batch['documents']:
            break
        response = requests.post(keyphrase_url, headers=headers, json=batch)
        doc_list = response.json()['documents']
        for doc in doc_list:
            key_phrases += doc['keyPhrases']
            for key in doc['keyPhrases']:
                bankKeys(bank, key)     # tally the full phrase
                arr = key.split(' ')
                wordCount(counts, arr)  # tally each individual word
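bankKeys() and wordCount() are also project helpers. From the call sites they look like simple frequency counters over phrases and words respectively; a plausible sketch (behavior inferred from usage, not taken from the project):

def bankKeys(bank, key):
    # Tally how often each full key phrase appears.
    bank[key] = bank.get(key, 0) + 1

def wordCount(counts, words):
    # Tally each individual word from a split key phrase.
    for word in words:
        counts[word] = counts.get(word, 0) + 1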
Example #4
import operator
from rake_nltk import Rake
from rake_nltk import Metric
import diffbot

# Load the custom stopword list (one word per line)
with open('finalList.txt', 'r', encoding='utf-8') as f:
    stop = f.read().split('\n')

r = Rake(ranking_metric=Metric.WORD_FREQUENCY)

url = 'https://www.cnn.com/2020/08/21/politics/peter-rafael-dzibinski-debbins-green-beret-russia/index.html'  # unused alternate test URL
urlNoNames = 'https://www.britannica.com/science/influenza'
json_result = diffbot.article(urlNoNames,
                              token='d656578220cbf622d16575aba331d47d')

words = json_result['objects'][0]['text']
r.extract_keywords_from_text(words)

result = r.get_ranked_phrases_with_scores()

print(result)
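Note that stop, the list read from finalList.txt, is never passed to Rake, so the custom stopword list has no effect as written. rake_nltk does accept a stopwords argument; if that was the intent, the constructor would presumably read:

r = Rake(stopwords=stop, ranking_metric=Metric.WORD_FREQUENCY)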
Example #5
# Process with diffbot
# --------------------

import sys
import time
from json import dumps
from urllib.parse import urlparse

import diffbot

#with open('urls-test.txt','r') as urls:
with open('urls.txt', 'r') as urls:
    for url in urls:
        try:
            print("---> Processing URL with diffbot: ", url)
            # Build an output filename from the URL's domain and path
            parsedurl = urlparse(url)
            domain = parsedurl.netloc
            newpath = parsedurl.path.replace('/', '_').strip()
            #filename = "corpus-test-diffbot/" + domain + newpath + ".txt"
            filename = "corpus-diffbot/" + domain + newpath + ".txt"
            with open(filename, "w") as outfile:
                #json_result = dumps(diffbot.article(url, token='f3472e9233ba4070833dfffb0fb97660'))
                json_result = dumps(diffbot.article(url, token='bcc855fd71b859791b2202d8297da1e3'))  # new LAS token 11/2015
                outfile.write(json_result)
        except Exception:
            print("**** ERROR processing URL with diffbot: ", url, sys.exc_info()[0])
        time.sleep(1)
'''
# --------------------
# Process with jusText
# --------------------
#with open('urls-test.txt','r') as urls:
with open('urls.txt', 'r') as urls:
    for url in urls:
        try:
            print("---> Processing URL with jusText: ", url)
            page = urllib2.urlopen(url).read()
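The excerpt cuts off here, inside the section that the opening ''' comments out. For reference, a typical jusText extraction over the fetched page looks like this (a sketch, not the project's code):

import justext

paragraphs = justext.justext(page, justext.get_stoplist("English"))
text = "\n".join(p.text for p in paragraphs if not p.is_boilerplate)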
Example #6
import diffbot

json_result = diffbot.article(
    'http://www.topsprogram.ca/reading-like-writers/',
    token='d656578220cbf622d16575aba331d47d')

print(json_result['objects'])
print(json_result['objects'][0]['text'])
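The parsed response carries more than the text field; Diffbot's Article API also returns fields such as title and tags on each object. For example (availability depends on the page):

article = json_result['objects'][0]
print(article.get('title'))
print(article.get('tags'))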