Example #1
    def __init__(self):

        # Helper objects for page fetching, name extraction, and metadata.
        self.getsoup = getsoup.GetSoup()
        self.namefinder = fetchnames.NameFinder()
        self.getmetadata = getmetadata.GetMetadata()

        # Load the list of common verbs from the local SQLite database.
        self.connection = sqlite3.connect("verbstest.db")
        self.cursor = self.connection.cursor()
        self.cursor.execute("SELECT * FROM commonverbs")
        result = self.cursor.fetchall()

        # Keep the raw rows, and pull the verb string out of the first
        # column of each row.
        self.verbarray = list(result)
        self.verbs = [str(row[0]) for row in self.verbarray]
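For the snippet above to run, verbstest.db needs a commonverbs table whose first column holds the verb string. A minimal one-time setup sketch, assuming a single-column schema (the column name verb and the sample rows are assumptions, not taken from the source):

import sqlite3

# Hypothetical schema: one TEXT column; the verb sits in the first column,
# which is all the __init__ above reads.
connection = sqlite3.connect("verbstest.db")
cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS commonverbs (verb TEXT)")
cursor.executemany("INSERT INTO commonverbs (verb) VALUES (?)",
                   [("said",), ("told",), ("reported",), ("confirmed",)])
connection.commit()
connection.close()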
Example #2
import re

import cues
import fetchnames

## getSoup is assumed to be defined elsewhere in this module.


def getScoopz(url):

    ## initialize packages

    phrasecues = cues.Cues()
    namefinder = fetchnames.NameFinder()

    articlenames = []  # currently unused

    scoopstrue = True  # flips to False when no scoop phrasing is found

    newerstring = ""
    headline = "Headline"  # placeholder value

    paras = getSoup(url)

    if paras:

        ## try to get the name of the publication from the URL

        publications = [
            "recode", "techcrunch", "bloomberg", "theinformation",
            "vanityfair", "mic", "venturebeat", "arstechnica", "motherboard",
            "ap", "fusion", "anandtech", "engadget", "latimes", "buzzfeed",
            "wsj", "theverge", "backchannel", "adage", "medium", "govinsider",
            "cnet", "reuters", "pcworld", "statnews"
        ]

        pubcap = {
            'recode': 'Recode',
            'techcrunch': 'TechCrunch',
            'bloomberg': 'Bloomberg',
            'theinformation': 'The Information',
            'vanityfair': 'Vanity Fair',
            'mic': 'Mic',
            'venturebeat': 'VentureBeat',
            'arstechnica': 'Ars Technica',
            'motherboard': 'Vice Motherboard',
            'ap': 'Associated Press',
            'fusion': 'Fusion',
            'anandtech': 'AnandTech',
            'engadget': 'Engadget',
            'latimes': 'Los Angeles Times',
            'buzzfeed': 'BuzzFeed',
            'wsj': 'The Wall Street Journal',
            'theverge': 'The Verge',
            'backchannel': 'Backchannel',
            'adage': 'Ad Age',
            'medium': 'Medium',
            'govinsider': 'GovInsider',
            'cnet': 'CNET',
            'reuters': 'Reuters',
            'pcworld': "PCWorld",
            "statnews": "STAT",
            "Not found": "Couldn't find publication."
        }

        # Grab the hostname tokens from the URL, e.g.
        # "https://www.wsj.com/article" -> ["www", "wsj", "com/article"].
        pubsplit = url.split("//")
        pubsplit = pubsplit[1].split(".")

        # Default before scanning, so an unrecognized host still maps cleanly.
        publication = "Not found"
        for i in pubsplit:
            if i in publications:
                publication = i

        publication = pubcap.get(publication, "Couldn't find publication.")

        # Drop "Inc. " so the ". " sentence split below doesn't break on it.
        paras = str(paras).replace("Inc. ", "")

        ## Fetch the phrases we're looking for on a per-publication basis

        if publication in phrasecues.phrases:
            phrases = phrasecues.phrases[publication]
        else:
            phrases = phrasecues.phrases['General']

        # Split the document into paragraphs on the opening <p> tags.
        array = str(paras).split("<p>")

        # Last names of people named in the article (computed but not used
        # further in this function).
        lastnames = namefinder.getLastNames(namefinder.getNameArray(array))

        # Keep only paragraphs that contain at least one cue phrase.
        newarray = []
        for para in array:
            for phrase in phrases:
                if phrase in para:
                    newarray.append(para)

        # Within those paragraphs, keep only the sentences that contain a cue.
        splitarray = []
        for para in newarray:
            for sentence in para.split(". "):
                for phrase in phrases:
                    if phrase in sentence:
                        splitarray.append(sentence + ". ")

        # Drop duplicate sentences while preserving order.
        newerarray = []
        for sentence in splitarray:
            if sentence not in newerarray:
                newerarray.append(sentence)

        newarray = newerarray

        newstr = ""

        for i in newarray:
            newstr += "--- " + i + "\n"

        ## Strip leftover tag fragments and normalize Unicode punctuation

        newstr = newstr.replace("</p>, ", "").replace("\u2019", "'")
        newstr = newstr.replace("\\xa0", " ").replace("\n\n.", "")
        newstr = newstr.replace('\u201c', "").replace('\u201d', "")
        newstr = newstr.replace('\u2014', "")

        ## Strip any remaining HTML tags (links included)

        newstr = re.sub(r'<[^>]*>', '', newstr)

        if '"' in newstr:
            newstr = "No scoops/nuggets."

        if newstr != "":
            newerstring += newstr
        else:
            newerstring += "No scoops/nuggets"
            scoopstrue = False

        return [headline, publication, newerstring, scoopstrue]
    else:
        return 'Broken'
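A minimal usage sketch for getScoopz (the URL is illustrative; re, cues, fetchnames, and a module-level getSoup helper are assumed to be importable):

result = getScoopz("https://www.theverge.com/example-article")
if result != 'Broken':
    headline, publication, scoops, found_scoops = result
    print publication
    print scoops
else:
    print "Could not fetch the page."

As a design note, urlparse(url).netloc from the Python 2 standard library would be a more defensive way to get the hostname than splitting on "//", which raises IndexError for scheme-less URLs.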
Example #3
import getTechmemeUrls
import getscoopz
import cues
import getsoup
import fetchquotes
import fetchnames
import getmetadata

# Wire up the project's helper classes.
scoop = getscoopz.GetScoopz()
quotes = fetchquotes.GetQuotes()
names = fetchnames.NameFinder()
soup = getsoup.GetSoup()
metadata = getmetadata.GetMetadata()

# tmUrls = getTechmemeUrls.TechmemeUrls()
# for i in tmUrls.techmemeURLs:
# 	result = scoop.getScoopz(i[1])
# 	print result[0] + ": " + i[0]
# 	print result[1]

# Prompt for an article URL, fetch the page, pull out its paragraphs,
# and print every quote found in them.
url = raw_input("Please enter a URL\n>")
data = soup.getSoup(url)
paras = metadata.getParas(data)

returner = quotes.getQuotes(paras)
for i in returner:
    print i
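The driver above is Python 2 throughout (urllib2, raw_input, print statements). A sketch of the same flow under Python 3, assuming the helper modules themselves have been ported:

import getsoup
import getmetadata
import fetchquotes

soup = getsoup.GetSoup()
metadata = getmetadata.GetMetadata()
quotes = fetchquotes.GetQuotes()

# input() replaces raw_input(), and print is a function in Python 3.
url = input("Please enter a URL\n>")
data = soup.getSoup(url)
paras = metadata.getParas(data)

for quote in quotes.getQuotes(paras):
    print(quote)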
Example #4
    def __init__(self):

        # Same helper setup as in Example #1: page fetching, name
        # extraction, and metadata.
        self.getsoup = getsoup.GetSoup()
        self.namefinder = fetchnames.NameFinder()
        self.getmetadata = getmetadata.GetMetadata()
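Examples #1 and #4 open with the same three helper initializations. A hypothetical refactor (the ScraperBase name is invented here, not taken from the source) that hoists the shared setup into a base class for both __init__ methods:

import getsoup
import fetchnames
import getmetadata


class ScraperBase(object):
    # Shared wiring for the helpers both examples construct; subclasses
    # call ScraperBase.__init__(self) and then add their own state.
    def __init__(self):
        self.getsoup = getsoup.GetSoup()
        self.namefinder = fetchnames.NameFinder()
        self.getmetadata = getmetadata.GetMetadata()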