Example #1
def get_spider(self):
    # Map the detected page type to the spider class that can parse it
    spider_map = {
        1: KeKeSpider,  # CNN
        2: FoxSpider,  # Fox News
        3: KeKeArticleSpider,  # Article
        4: EN24Spider  # 24en
    }
    response = requests.get(url=self.url, headers=HEADERS)
    page_type = -1
    if response.status_code == 200:
        html = response.content.decode("utf8")
        title = str(BeautifulSoup(html, 'lxml').find("title"))
        if title.count("福克斯") and title.count("新闻"):  # Fox News page with a Chinese title
            page_type = 2
        elif title.count("CNN") or title.count("cnn"):
            page_type = 1
        elif self.url.count("Article"):
            page_type = 3
        elif title.count("NPR") or title.count("npr"):
            page_type = 1
        elif self.url.startswith("https://www.24en.com/voa"):
            page_type = 4
        print(page_type)
        return spider_map.get(page_type)
    else:
        return None
Example #2
import re
from bs4 import BeautifulSoup

def countKeywords(page, keywords):
    # One counter per keyword, in the same order as the input list
    counts = [0] * len(keywords)

    # Strip the markup, then the punctuation that would interfere with word matching
    bodyText = BeautifulSoup(page, 'html.parser').get_text()
    bodyText = re.sub('[.!?/(),;\'\"]', '', bodyText)

    wordsInBody = bodyText.split()
    for i, keyword in enumerate(keywords):
        if ' ' in keyword:
            # Multi-word keyword: count substring occurrences in the full text
            counts[i] = bodyText.count(keyword)
        else:
            # Single word: count exact word matches
            counts[i] = wordsInBody.count(keyword)
    return counts
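A quick usage sketch for countKeywords above; the HTML string and keyword list are invented for illustration, and matching is case-sensitive:

keywords = ["climate", "net zero"]
html = "<p>Climate policy and net zero targets: climate news.</p>"
print(countKeywords(html, keywords))  # -> [1, 1]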
Example #3
import json
from bs4 import BeautifulSoup

def parse_ria_json(path):
    with open(path, "r", encoding="utf-8") as r:
        # The old regex-based ria2020 parsing is commented out below
        # pat = '{\"text\": \"(.*)\", \"title\": \"(.*)\"}'
        for line in r:
            data = json.loads(line.strip())
            # data = re.search(pat, line.strip())
            # title = data.group(2).lower().strip()
            # clean_text = data.group(1).lower().replace('\xa0', ' ').replace('\n', ' ').strip()

            title = data["title"]
            text = data["text"]

            # Strip HTML tags and normalize non-breaking spaces and newlines
            clean_text = BeautifulSoup(text, 'html.parser').text.replace(
                '\xa0', ' ').replace('\n', ' ')
            # Skip records whose body or title is empty or shorter than four words
            if not clean_text or not title or clean_text.count(' ') < 3 or title.count(' ') < 3:
                continue
            yield clean_text, title
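A minimal usage sketch for the generator above, assuming a local file named ria.json (hypothetical path) with one {"text": ..., "title": ...} JSON object per line:

for clean_text, title in parse_ria_json("ria.json"):
    print(title[:60], "|", clean_text[:60])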
Example #4
        def states():
            # Collect the value of every state <option>, skipping the placeholder entry
            state_select = self.get_state_select()
            state_select_option_values = [
                '%s' % o.get_attribute('value')
                for o in state_select.options[1:]
            ]

            for v in state_select_option_values:
                state_select = self.select_state_option(v)
                text = BeautifulSoup(self.driver.page_source, "html.parser").get_text()
                meta_prices = []
                for keyword in keywords:
                    prices = []
                    counter = text.count(keyword)
                    # For each occurrence of the keyword (scanning from the right),
                    # take the rest of its line as the price
                    for z in range(counter):
                        prices.append(text.rsplit(keyword, z + 1)[1].splitlines()[0])
                    prices = [float(price) for price in prices]
                    meta_prices.append(prices)
                yield (state_select.first_selected_option.text, meta_prices)
Example #5
        def states():
            # Collect the value of every state <option>, skipping the placeholder entry
            state_select = self.get_state_select()
            state_select_option_values = [
                '%s' % o.get_attribute('value')
                for o in state_select.options[1:]
            ]

            for v in state_select_option_values:
                state_select = self.select_state_option(v)
                text = BeautifulSoup(self.driver.page_source, "html.parser").get_text()
                meta_prices = []
                for keyword in keywords:
                    prices = []
                    counter = text.count(keyword)
                    # For each occurrence of the keyword (scanning from the right),
                    # take the rest of its line as the price
                    for z in range(counter):
                        prices.append(text.rsplit(keyword, z + 1)[1].splitlines()[0])
                    prices = [float(price) for price in prices]
                    meta_prices.append(prices)
                yield (state_select.first_selected_option.text, meta_prices)
Example #6
def nested(ScrappedPageStruct):
    # Prettify the scraped page's HTML and count occurrences of the column name in it
    content_bs = BeautifulSoup(ScrappedPageStruct.content,
                               'html.parser').prettify()
    return content_bs.count(column)
Example #7
def process_text(text):
    mentions = text.count('@')
    hashtags = text.count('#')
    urls = len(find_urls(text))

    # Remove links
    text = ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", text).split())

    # Remove mentions
    text = ' '.join(
        re.sub(r"(@[A-Za-z0-9^\w]+)", " ",
               text.replace('@ ', '@').replace('# ', '#')).split())

    # Replace hashtags with words
    if text.count('#') > 0:
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text.replace('#', ' ')))

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Save content length (excluding links and mentions)
    length = len(text)

    # Remove punctuation symbols
    text = ' '.join(
        re.sub(r"[\.\,\¡\¿\!\?\:\;\-\=\*\(\)\[\]\"\'\“\_\+\”\%\/\‘\’]", " ",
               text).split())
    text = text.translate(remove_digits).translate(remove_punctuation)

    # Lower case to avoid case sensitive problems
    text = text.lower()

    # Replace emojis with names
    text = emoji.demojize(text)

    # Add space between emojis and other characters
    ind = -2
    for c in range(text.count(':')):
        ind = text.find(':', ind + 2)
        if c % 2 == 0:
            newLetter = ' :'
        else:
            newLetter = ': '
        text = "".join((text[:ind], newLetter, text[ind + 1:]))

    # Replace emoji names with spanish meaning
    result = []
    parts = text.split(' ')
    for part in parts:
        if part:
            if part[0] == ':':
                em = handle_emoji_tone(part)
                em = emoji_meaning(em)
                if em:
                    result.append(em)
            else:
                result.append(part)

    text = ' '.join(result)

    # Filter out stopwords with NLTK and join the remaining tokens back into a string
    word_tokens = word_tokenize(text)
    result = [w for w in word_tokens if w not in stop_words]
    text = ' '.join(result)

    # Sentiment analysis: translate to English first; fall back to neutral values
    # if the text is empty or translation fails
    analysis = TextBlob(text)
    try:
        eng = analysis.translate(to='en')
        sentiment = eng.sentiment
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity
    except Exception:
        polarity = 0.0
        subjectivity = 0.0

    result = {
        'no_hashtags': [hashtags],
        'no_mentions': [mentions],
        'no_urls': [urls],
        'effective_length': [length],
        'polarity': [polarity],
        'subjectivity': [subjectivity]
    }

    return result
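A hypothetical call sketch for process_text above; the function relies on helpers defined elsewhere in the original module (find_urls, stop_words, remove_digits, remove_punctuation, handle_emoji_tone, emoji_meaning), so this only illustrates the expected input and the shape of the returned dictionary:

# Invented Spanish-language tweet, since the function maps emoji to Spanish meanings
tweet = "Mira esto https://example.com @alguien #BuenasNoticias 😀"
features = process_text(tweet)
print(features)  # e.g. {'no_hashtags': [1], 'no_mentions': [1], 'no_urls': [1],
                 #       'effective_length': [...], 'polarity': [...], 'subjectivity': [...]}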
Example #8
    return start


for i in range(1, 53):
    pagina = i
    url = "http://www.zimmo.be/nl/panden/?status=1&type%5B0%5D=5&hash=86c443f7d824f3e29fb8377485e15a2c&priceIncludeUnknown=1&priceChangedOnly=0&bedroomsIncludeUnknown=1&bathroomsIncludeUnknown=1&constructionIncludeUnknown=1&livingAreaIncludeUnknown=1&landAreaIncludeUnknown=1&commercialAreaIncludeUnknown=1&yearOfConstructionIncludeUnknown=1&epcIncludeUnknown=1&queryCondition=and&includeNoPhotos=1&includeNoAddress=0&onlyRecent=0&onlyRecentlyUpdated=0&isPlus=0&region=list&district=MzAYBcMGGKICbOIgHkIcVTcVHAAA&pagina={}#gallery".format(
        pagina)
    request1 = requests.get(url)
    #request2 = requests.get(url2, cookies = request1.cookies)

    table = str(BeautifulSoup(request1.text, "html.parser"))
    #print(table)

    # Every listing on the page is marked with id="pand-..."
    stop = table.count("id=\"pand-")
    print(stop)

    for x in range(1, stop):
        #rows = table.findAll('class')[2::3]
        # Slice the raw HTML between two consecutive listing markers
        eerste = find_nth(table, "id=\"pand-", x)
        tweede = find_nth(table, "id=\"pand-", x + 1)
        listing = table[eerste:tweede]
        #print(listing)
        #listing = table.find("div", attrs={"class":"item "})

        # The listing id sits directly after the marker on the first line
        list1 = listing.split('\n', 1)[0]
        id = list1[9:19]
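Example #8 calls a helper find_nth whose definition is cut off above (only its final return start survives). A minimal sketch of what such a helper typically looks like, assuming 1-based occurrence counting as the loop for x in range(1, stop) suggests:

def find_nth(haystack, needle, n):
    # Return the index of the n-th occurrence of needle in haystack (1-based),
    # or -1 if there are fewer than n occurrences.
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + len(needle))
        n -= 1
    return start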
Example #9
for index, url in enumerate(urls2):
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    paraTitle = list(soup.find_all('h3'))

    for paragraph in paraTitle:
        # The paragraph text follows two siblings after the <h3> heading
        para = str(paragraph.nextSibling.nextSibling)
        paraText = BeautifulSoup(para, "html.parser").get_text().strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies2[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)

for index, url in enumerate(urls3):
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    para = list(soup.find_all('li'))

    for paragraph in para:
        paraText = paragraph.get_text().strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies3[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)
Example #10
for row in soup.findAll(
        "row"):  # Run through each row and pick apart the call details
    call = row.findAll("cell")
    if len(call) == 6:  # If the row probably contains good call data
        calldate = str(call[0].get_text())
        calltype = str(call[5].get_text())
        address = str(call[2].get_text())
        address = address[0:address.rfind(',')]
        unitids = BeautifulSoup(call[3].get_text(), 'html.parser')
        unitids = str(unitids.get_text())
        unitids = unitids.replace("?", "")
        unitids = unitids.replace("^", "")
        latlong = str(call[4].get_text())
        lat = float(latlong.split(',')[0])
        long = float(latlong.split(',')[1])
        numunits = unitids.count(',') + 1
        callDesc = "[" + calltype + "] " + address + ": " + unitids + " (" + str(
            numunits
        ) + ") @ " + calldate + " http://maps.google.com/?q=" + str(
            lat) + "," + str(long)
        savedType, savedFooter = storeCall(
        )  #Store the call. Reutrn type is 0 for old, 1 for new, 2 for update call. Footer is a string of dbid & what (if any) changed
        if savedType == 1:  #If it is a new call
            sendTweet(callDesc)
        if savedType > 0:  #If the call is new or updated
            if numunits > 7 or any(
                    word in callDesc
                    for word in COOLCALL):  #Check if it should email
                sendEmail(savedFooter)
        elif savedType < 0:
            sendEmail("\nstoreCall() ERROR!!")
Example #11
from bs4 import BeautifulSoup

def get_keyword_frequency(html_string):
    # kwlist is a module-level list of keywords defined elsewhere
    text = BeautifulSoup(str(html_string), "html.parser").text
    if len(text) == 0:
        return 0
    return sum(text.count(kword) for kword in kwlist)
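A small usage sketch; kwlist lives at module level in the original, so it is defined explicitly here, and the HTML string is invented. Matching is a plain case-sensitive substring count:

kwlist = ["privacy", "cookie"]
print(get_keyword_frequency("<p>Privacy policy: cookie settings and cookie consent.</p>"))  # -> 2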
Example #12
    Ppos = int(row[3])
    perA = row[4].split(" ")
    Apos = int(row[5])
    Aans = row[6]
    perB = row[7].split(" ")
    Bpos = int(row[8])
    Bans = row[9]
    url = row[10]

    # Try to fetch the page at the given URL; occurrence counts are stored in Acount and Bcount
    for trial in range(3):
        try:
            html_content = requests.get(url, timeout=(3, 10)).text
            soup = BeautifulSoup(html_content, "lxml")
            soup = soup.prettify()
            Acount = soup.count(perA[0])
            Bcount = soup.count(perB[0])
            break
        except Exception:
            Acount = 0
            Bcount = 0

    # Find all positions of the preposition, nounA, and nounB
    Pord = find_ord(row[1], prep[0])
    Aord = find_ord(row[1], perA[0])
    Bord = find_ord(row[1], perB[0])

    # Build a list of information that can be read off the text for every word:
    # [word, all positions of the word, index of the previous element to get the wanted word]
    target = [prep, Pord, get_nth(row[1], prep[0], Ppos, Pord)]
    first = [perA, Aord, get_nth(row[1], perA[0], Apos, Aord)]
Example #13
    busName = dataInJson["results"][k]["name"]
    busAdd = dataInJson["results"][k]["formatted_address"]
    busId = dataInJson["results"][k]["place_id"]
    # Look up the place details to get the phone number and structured address
    url2 = urllib.request.urlopen(
        "https://maps.googleapis.com/maps/api/place/details/json?placeid=" +
        busId + '&key=' + api_key)
    data2 = url2.read().decode('utf-8')

    dataInJson2 = json.loads(data2)
    phoneNumber = dataInJson2["result"].get("formatted_phone_number")
    street = dataInJson2["result"].get('adr_address')
    # adr_address is HTML; strip the tags to get plain text
    street = BeautifulSoup(street, "html.parser").text

    commaCount = street.count(",")
    testAddress = re.search(r"^[0-9]\d", street)

    # Keep only entries that start with a street number, have a phone number,
    # and split cleanly into street, city, and "STATE ZIP"
    if testAddress and phoneNumber is not None and commaCount == 3:
        street = street.split(", ")
        streetName = street[0]
        City = street[1]
        State = street[2].split(" ")
        StateName = State[0]
        Zip = str(State[1])[:5]

        if StateName == "NC":
            new_row = {
                "Name": busName,
                "Business Name": busName,
Example #14
    def query(self, q, a):
        t0 = time.time()
        ua = UserAgent()
        #header = {'User-Agent':str(ua.random)}

        s = requests.Session()
        url = 'https://www.google.com/search?q={}'.format(q)

        fpage_list = []
        total_res_list = []
        nlp_list = []
        #webbrowser.open(url)

        front_page = s.get(url)
        front_soup = BeautifulSoup(front_page.text, 'html.parser')

        for script in front_soup(["script", "style"]):
            script.extract()

        front_soup = front_soup.get_text().lower()

        for ans in a:
            # naive question + answer
            query = '{} "{}"'.format(q, ans)
            url = 'https://www.google.com/search?q={}'.format(query)
            r = s.get(url)

            soup = BeautifulSoup(r.text, 'html.parser')
            '''
      entities = self.nlp(q)

      nlp_query = " ".join(['"{}"'.format(entities[i].name) for i in range(len(entities))])
      nlp_query += '" {}"'.format(ans)
      url = 'https://www.google.com/search?q={}'.format(nlp_query)
      nlp_r = s.get(url)

      nlp_soup = BeautifulSoup(nlp_r.text, 'html.parser')
      for script in nlp_soup(["script", "style"]):
        script.extract()

      nlp_soup = nlp_soup.get_text().lower()
  '''
            page_count = front_soup.count(ans.lower())
            num_results = soup.find('div', {'id': 'resultStats'}).text

            num_results = [
                el for el in num_results.split(" ") if el[0].isdigit()
            ]
            #num_results = "".join([c for c in num_results if c.isdigit()])

            num_results = 0 if num_results == [] else int(
                num_results[0].replace(",", ""))

            #nlp_count = nlp_soup.count(ans.lower())
            print('{} -- {} -- {}'.format(ans, num_results, page_count))
            #nlp_count))

            fpage_list.append(page_count)
            total_res_list.append(num_results)
            #nlp_list.append(nlp_count)

        t1 = time.time()
        print('Time: {}'.format(t1 - t0))
        return {'total_res': total_res_list, 'front_page': fpage_list}
Example #15
for index, url in enumerate(urls2):
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    paraTitle = list(soup.find_all('h3'))

    for paragraph in paraTitle:
        # The paragraph text follows two siblings after the <h3> heading
        para = str(paragraph.nextSibling.nextSibling)
        paraText = BeautifulSoup(para, "html.parser").get_text().strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies2[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)

for index, url in enumerate(urls3):
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    para = list(soup.find_all('li'))

    for paragraph in para:
        paraText = paragraph.get_text().strip()
        if paraText != "":
            capsCount = sum(1 for x in paraText if x.isupper())
            rowValue = pd.Series([companies3[index], paraText, len(paraText),
                                  paraText.count(' '), capsCount])
            df = df.append(rowValue, ignore_index=True)
Example #16
# Simple program to search for the number of occurrences of a word within any given webpage.
# Built for my brother's sentiment analysis project, so that he could check 500 news articles
# for the word counts of a preset list of words and determine emotional bias in the description.

import requests
from bs4 import BeautifulSoup

keywords = input(
    "List the keywords, separated by commas (without spaces).\n").split(",")

while True:
    page = input("Input the website URL: ")
    txt = BeautifulSoup(requests.get(page).text,
                        features="lxml").get_text().upper()
    for i in keywords:
        print(i + ": " + str(txt.count(i.upper())))
Example #17
from bs4 import BeautifulSoup

def get_keyword_frequency(html_string):
    # kwlist is a module-level list of keywords defined elsewhere
    text = BeautifulSoup(str(html_string), "html.parser").text
    if len(text) == 0:
        return 0
    return sum(text.count(kword) for kword in kwlist)
Example #18
# Share capital -> dividends and rights issues (分红配股)

## http://quotes.money.163.com/f10/fhpg_000001.html#01d05

import requests
from bs4 import BeautifulSoup

url = 'http://quotes.money.163.com/f10/fhpg_000001.html#01d05'
wb_data = requests.get(url)
# print(wb_data.text)
# html = '''
# <ul>
#     <li class="item-0" name="one"><a href="www.baidu.com">baidu</a>
#     <li class="item-1" name="two"><a href="www.alibaba.com">alibaba</a>
# '''
# soup = BeautifulSoup(html,'html.parser')
# print(soup.li.a.string)
soup = BeautifulSoup(wb_data.text, "html.parser")
# print(soup)
path = '.inner_box'
# select() returns a list of matching elements; take the first .inner_box element
inner = soup.select(path)[0]
print(inner.select("table"))
# Count occurrences of 'thead' in the element's serialized HTML
print(str(inner).count('thead'))


# print("####################################################################################")
#
# trrs = soup.select('tr')[2].select('td')
# print(trrs)
# for item in trrs:
#     print(item.text)
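If the goal is to count the actual <thead> elements rather than occurrences of the string 'thead' in the markup, a short alternative sketch using find_all on the inner element selected above:

theads = inner.find_all('thead')
print(len(theads))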
Example #19
    posts = api.posts.get("hot", 0)

    for post in posts:
        comments = api.comments.get(post.id)
        for comment in comments:
            if 20 < len(comment.text) < 100:
                striped_comment = BeautifulSoup(comment.text, "html.parser").text
                syllables = get_syllables(striped_comment)
                count = len(syllables)
                # A haiku has 17 syllables in a 5-7-5 pattern
                if count == 17:
                    # Skip comments posted by this bot itself
                    if comment.author == credentials.login:
                        break
                    # Require at least three words
                    if striped_comment.count(" ") >= 2:
                        haiku = haikufi(syllables)
                        if haiku:
                            print striped_comment.encode('utf-8')
                            print "http://pikabu.ru/story/empty_" + str(post.id) + "#comment_" + str(comment.id)
                            result = ''.join(haiku)

                            try:
                                with db:
                                    db.execute('''INSERT INTO comments(comment, post_id, comment_id)
                                      VALUES(?,?,?)''', (result, post.id, comment.id))
                                    api.comments.add(result, post.id, comment.id)
                            except sqlite3.IntegrityError:
                                print('Record already exists')

    db.close()
Example #20
stocksplit = 0
dividend = 0
buyback = 0
sharerepurchase = 0
acquire = 0
acquisition = 0
bankrupt = 0

while case <= sufswitchcase:
    try:
        page = urllib.request.urlopen(url + str(case) + suffix2)
    except urllib.error.URLError:
        case += 1
    else:
        # Reuse the page fetched in the try block; strip the markup and count keyword occurrences
        text = BeautifulSoup(page, "html.parser").get_text().lower()
        option = text.count("option")
        spindashoff = text.count("spin-off")
        spinoff = text.count("spinoff")
        spinspaceoff = text.count("spin off")
        spundashoff = text.count("spun-off")
        spunoff = text.count("spunoff")
        spunspaceoff = text.count("spun off")
        insider = text.count("insider")
        divest = text.count("divest")
        fraud = text.count("fraud")
        call = text.count("call")
        put = text.count("put")
        pyramid = text.count("pyramid")
        mislead = text.count("mislead")
        misled = text.count("misled")
        merger = text.count("merger")
Example #21
'''
    Simple script to read a web page and create a file with the corresponding
    formatted ("pretty") html. Example:
    
        ./pretty.py   http://google.com
'''

import sys
import requests
from bs4 import BeautifulSoup
import lxml

try:
    url = sys.argv[1]
except IndexError:
    url = input('Enter url : ')

print('Retrieving:', url)


source = requests.get(url)
pretty = BeautifulSoup(source.text, 'lxml').prettify()
print('Lines     : ', pretty.count('\n'))

with open('pretty.html', 'w') as f:
    f.write(pretty)
    print('pretty.html written')

exit()