Example #1
    def create_json_node(self, page):

        if getattr(page, 'status', 'published') != 'published':
            return

        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), 'html.parser')
        page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

        soup_text = BeautifulSoup(page.content, 'html.parser')
        page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
        page_text = ' '.join(page_text.split())

        page_category = page.category.name if getattr(page, 'category', 'None') != 'None' else ''

        page_url = '.'
        if page.url:
            page_url = page.url if self.relative_urls else (self.siteurl + '/' + page.url)

        node = {'title': page_title,
                'text': page_text,
                'tags': page_category,
                'url': page_url,
                'loc': page_url} # changed from 'url' to 'loc' (see http://blog.siphos.be/2015/08/updates-on-my-pelican-adventure/); after a Pelican update, static/tipuesearch/tipuesearch.js in the theme folder looks for the 'loc' attribute.

        self.json_nodes.append(node)
Example #2
    def crawl_web(self, time):                                # returns index, graph of inlinks
        print 'Starting crawl'
        t = clock()                                           # initial time
        while self.tocrawl and clock() - t < time:            # loop while tocrawl is non-empty and elapsed time < limit
            url = self.tocrawl.pop(0)                         # take first page from tocrawl
            if url not in self.crawled:                       # check if page is not in crawled
                self.current_page = url
                html = self.get_text(url)                     # gets contents of page
                if html != '':
                    try:
                        soup = BeautifulSoup(html, 'lxml')    # parse with lxml (faster html parser)
                    except:                                   # parse with html5lib if lxml fails (more forgiving)
                        soup = BeautifulSoup(html, 'html5lib') 
                    try:
                        text = str(soup.get_text()).lower()   # convert from unicode
                    except:
                        text = soup.get_text().lower()        # keep as unicode
                    #try:
                    #    title = soup.title.string
                    #except:
                    #    pass #do nothing
                    outlinks = self.get_all_links(soup)       # get links on page
                    self.pages[url] = (tuple(outlinks), text) # creates new page object
                    self.add_page_to_index(url)               # adds page to index
                    self.union(self.tocrawl, outlinks)        # adds links on page to tocrawl
                    self.crawled.append(url)                  # add the url to crawled

        print 'Crawl finished'
Example #3
def create_a_beautiful_soup_object(html):
    """
    Try to create a BeautifulSoup object whose parsed text is not empty.
    If the text comes back empty, retry with a different HTML parser.

    Args:
        html (string): HTML string of the email body to be sent.

    Returns:
        soup (BeautifulSoup object or None)
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')

    if soup.get_text() == '':
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        if soup.get_text() == '':
            soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

            if soup.get_text() == '':
                soup = BeautifulSoup(html, 'xml', from_encoding='utf-8')

                if soup.get_text() == '':
                    soup = None

    return soup
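
A minimal usage sketch for the parser-fallback helper above (assumes bs4 plus the lxml and html5lib parsers are installed; the HTML string is invented for illustration):

from bs4 import BeautifulSoup

html = '<html><body><p>Hello, world</p></body></html>'  # hypothetical email body
soup = create_a_beautiful_soup_object(html)
if soup is not None:
    print(soup.get_text(strip=True))  # Hello, world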
Example #4
    def create_json_node(self, page):

        if getattr(page, 'status', 'published') != 'published':
            return

        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), "html.parser")
        page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

        soup_text = BeautifulSoup(page.content, "html.parser")
        page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
        page_text = ' '.join(page_text.split())

        if not hasattr(page, 'tags'):
            page_tags = ''
        else:
            page_tags = " ".join([tag.name for tag in page.tags])

        page_url = self.siteurl + '/' + page.url

        node = {'title': page_title,
                'text': page_text,
                'tags': page_tags,
                'url': page_url}

        self.json_nodes.append(node)
Example #5
def get_text(l1, l2):
    soup1 = BeautifulSoup(l1)
    # kill all script and style elements
    for script in soup1(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text1 = soup1.get_text()

    # break into lines and remove leading and trailing space on each
    lines1 = (line.strip() for line in text1.splitlines())
    # break multi-headlines into a line each
    chunks1 = (phrase.strip() for line in lines1 for phrase in line.split("  "))
    # drop blank lines
    text1 = '\n'.join(chunk for chunk in chunks1 if chunk)

    #print(text1.encode('utf-8'))

    soup2 = BeautifulSoup(l2)
    # kill all script and style elements
    for script in soup2(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text2 = soup2.get_text()

    # break into lines and remove leading and trailing space on each
    lines2 = (line.strip() for line in text2.splitlines())
    # break multi-headlines into a line each
    chunks2 = (phrase.strip() for line in lines2 for phrase in line.split("  "))
    # drop blank lines
    text2 = '\n'.join(chunk for chunk in chunks2 if chunk)

    #print(text2.encode('utf-8'))
    return text1 == text2
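
An illustrative call of the comparison helper above; the two HTML snippets are made up for the example:

from bs4 import BeautifulSoup

a = '<div><p>Hello world</p><script>var x = 1;</script></div>'
b = '<span>Hello world</span>'
print(get_text(a, b))  # True: scripts are stripped and the visible text matches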
Example #6
def dnsquery(dn):
	url = "https://jiexifenxi.51240.com/web_system/51240_com_www/system/file/jiexifenxi/get/?ajaxtimestamp=1526175925753"
	headers = {
		'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'}
	params = {'q': '{}'.format(dn), 'type': 'a'}
	reqst = requests.post(url=url, headers=headers, params=params)
	content = reqst.content.decode('utf-8')
	bd = BeautifulSoup(content, 'html.parser')

	print('---[+]A record---')
	print(bd.get_text())

	print('---[+]MX record---')
	params2 = {'q': '{}'.format(dn), 'type': 'mx'}
	rest = requests.post(url=url, headers=headers, params=params2)
	content2 = BeautifulSoup(rest.content.decode('utf-8'), 'html.parser')
	print(content2.get_text())

	print('---[+]CNAME record---')
	params3 = {'q': '{}'.format(dn), 'type': 'cname'}
	rest2 = requests.post(url=url, headers=headers, params=params3)
	content3 = BeautifulSoup(rest2.content.decode('utf-8'), 'html.parser')
	print(content3.get_text())

	print('---[+]NS record---')
	params4 = {'q': '{}'.format(dn), 'type': 'ns'}
	rest3 = requests.post(url=url, headers=headers, params=params4)
	content4 = BeautifulSoup(rest3.content.decode('utf-8'), 'html.parser')
	print(content4.get_text())

	print('---[+]TXT record---')
	params5 = {'q': '{}'.format(dn), 'type': 'txt'}
	rest4 = requests.post(url=url, headers=headers, params=params5)
	content5 = BeautifulSoup(rest4.content.decode('utf-8'), 'html.parser')
	print(content5.get_text())
Example #7
def get_travel_content(url):
    res = build_request(url)
    res_text = res.text
    try:
        soup = BeautifulSoup(res_text, 'lxml').find('div', {'class': 'vc_article'})
        img_list = []
        content = soup.get_text().replace('\r', '').replace('\n', ' ').replace('\xa0','')
        items = soup.find_all('div', {'class': 'add_pic _j_anchorcnt _j_seqitem'})
        for item in items:
            try:
                img_url = item.find('img').get('data-src')
            except:
                continue
            img_list.append(img_url)
        result = {
            'content': content,
            'images': img_list
        }
        return result
    except:
        soup=BeautifulSoup(res_text,'lxml').find('div',{'id':'pnl_contentinfo'})
        img_list = []
        content = soup.get_text().replace('\r', '').replace('\n', ' ').replace('\xa0','')
        items = soup.find_all('img')
        for item in items:
            try:
                img_url = item.get('data-src')
            except:
                continue
            img_list.append(img_url)
        result = {
            'content': content,
            'images': img_list
        }
        return result
Example #8
 def get_nodes(self):
     from bs4 import BeautifulSoup
     from artgraph.node import Node, NodeTypes
     from artgraph.relationship import AssociatedActRelationship, MembershipRelationship, ArtistGenreRelationship
     
     relationships = []
     node = self.get_node()
     wikicode = self.get_wikicode(node.get_dbtitle())
     
     if wikicode:
         for t in wikicode.filter_templates():
             if t.name.matches('Infobox musical artist'):
                 db = self.get_artistgraph_connection()
                 cursor = db.cursor()
                 
                 # Fill in current node info
                 if t.has('birth_name'):
                     name_cleaner = BeautifulSoup(str(t.get('birth_name').value))
                     
                     while name_cleaner.ref:
                         name_cleaner.ref.extract()
                     
                     cursor.execute("UPDATE artist SET name = %s WHERE id = %s", (name_cleaner.get_text(), node.get_id()))
                         
                 if t.has('image'):
                     image_cleaner = BeautifulSoup(str(t.get('image').value))
                     image = image_cleaner.get_text()
                     
                     cursor.execute("UPDATE artist SET imageLocation = %s WHERE id = %s", (self.resolve_image(image), node.get_id()))
                 
                 db.commit()
                 db.close()
                     
                 if t.has('associated_acts'):
                     associated_acts = t.get('associated_acts')
                     
                     for w in associated_acts.value.filter_wikilinks():
                         relationships.append(AssociatedActRelationship(node, Node(str(w.title), NodeTypes.ARTIST)))
                 
                 if t.has('genre'):
                     genres = t.get('genre')
                      
                     for w in genres.value.filter_wikilinks():
                         relationships.append(ArtistGenreRelationship(node, Node(str(w.title), NodeTypes.GENRE)))
                         
                 if t.has('current_members'):
                     current_members = t.get('current_members')
                     
                     for w in current_members.value.filter_wikilinks():
                         relationships.append(MembershipRelationship(node, Node(str(w.title), NodeTypes.ARTIST), True))
                         
                 if t.has('past_members'):
                     current_members = t.get('past_members')
                     
                     for w in current_members.value.filter_wikilinks():
                         relationships.append(MembershipRelationship(node, Node(str(w.title), NodeTypes.ARTIST), False))
                         
                 break
             
     return relationships
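Example #9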
    def parse(self, response):
        hxs = HtmlXPathSelector(response)

        con = lite.connect(DOCSET_PATH + '/Contents/Resources/docSet.dsidx')
        with con:

            cur = con.cursor()
            cur.execute("DROP TABLE IF EXISTS searchIndex")
            cur.execute("CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT)")

            items = []
            for a in hxs.select("//ol/li/a[@href[contains(., '.html')]]"):

                soup = BeautifulSoup(a.extract())
                path = soup.a['href']
                path = path[len('../'):]

                if path.endswith('-binding.html'):
                    type = "Binding"
                    try:
                        title = soup.code.get_text()
                    except Exception:
                        title = soup.get_text()
                else:
                    type = "Guide"
                    title = soup.get_text()

                cur.execute("INSERT INTO searchIndex(name, type, path) VALUES(?, ?, ?)", (title, type, path))
                # item = BindingItem()
                # item['title'] = title
                # item['path'] = path
                # items.append(item)

        return items
Example #10
    def create_json_node(self, page):

        if getattr(page, 'status', 'published') != 'published':
            return

        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '))
        page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

        soup_text = BeautifulSoup(page.content)
        page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
        page_text = ' '.join(page_text.split())

        if getattr(page, 'category', 'None') == 'None':
            page_category = ''
        else:
            page_category = page.category.name

        page_url = self.siteurl + '/' + page.url

        node = {'title': page_title,
                'text': page_text,
                'tags': page_category,
                'loc': page_url}

        self.json_nodes.append(node)
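Example #11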
    def create_json_node(self, page):
        if getattr(page, 'status', 'published') != 'published':
            return

        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '), 'html.parser')
        page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

        soup_text = BeautifulSoup(page.content, 'html.parser')
        page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
        page_text = ' '.join(page_text.split())

        if getattr(page, 'category', 'None') == 'None':
            page_category = ''
        else:
            page_category = page.category.name

        page_url = self.siteurl + os.path.dirname(page.source_path.split("content")[-1]) + "/" +  page.url
        #print ">>>", page_url

        node = {'title': page_title,
                'text': page_text,
                'tags': page_category,
                'url':  page_url}

        self.json_nodes.append(node)
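Example #12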
 def parsePost(self,response):
     logging.info(response)
     sel = Selector(response)
     posts = sel.xpath('//*[@id="posts"]/li')
     items = []
     topic = sel.css('.threadtitle').xpath('./a/text()').extract()[0]
     condition="Carcinoid Cancer"
     url = response.url
     for post in posts:
         item = PostItemsList()
         item['author'] = post.xpath('.//a[contains(@class, "username")]/strong/text()').extract()[0].strip()
         item['author_link']=response.urljoin(post.xpath('.//a[contains(@class, "username")]/@href').extract()[0])
         date = post.css('.postdate').extract()[0]
         soup = BeautifulSoup(date, 'html.parser')
         date=re.sub(" +|\n|\r|\t|\0|\x0b|\xa0",' ',soup.get_text()).strip()
         item['condition']=condition
         item['create_date']=date
         post_msg=post.css('.postcontent').extract()[0]
         soup = BeautifulSoup(post_msg, 'html.parser')
         post_msg = re.sub(" +|\n|\r|\t|\0|\x0b|\xa0",' ',soup.get_text()).strip()
         item['post']=post_msg
         item['tag']=''
         item['topic'] = topic
         item['url']=url
         logging.info(post_msg)
         items.append(item)
     return items
Example #13
 def fetch_status(self):
     submissionId = self.submissionId
     while True:
         r = urlopen(urls.STATUS_URL,
                     data=urlencode(dict(
                         ids=submissionId
                     )))
         data = json.loads(r.read())
         data = data[0]
         final = data['final']
         if final == '1':
             print '\r\x1b[KResult: %s' % data['status_description'].strip()
             print 'Memory: %s' % data['mem'].strip()
             # Fixes the "no parser specified" warning: without an explicit
             # parser, BeautifulSoup picks the best one installed, which can
             # behave differently on different systems.
             soup = BeautifulSoup(data['time'],"lxml")
             time_taken = soup.get_text()
             print 'Time: %s' % time_taken.strip()
             if "accepted" in data['status_description']:
                 prob_db=utils.get_problem_database()
                 prob_db[self.problem]['solved']=True
                 utils.set_problem_database(prob_db)
             break
         else:
             soup = BeautifulSoup(data['status_description'],"lxml")
             string = soup.get_text().strip()
             string = string.replace('\t', '')
             string = string.replace('\n', '')
             sys.stdout.write('\r\x1b[KStatus: %s' % string)
             sys.stdout.flush()
         time.sleep(0.5)
Example #14
def get_lyrics(song,band):

    # Constructing the url to fetch lyrics from

    lyrics_url = "http://www.azlyrics.com/lyrics/" + band.replace(" ","") + "/" + song.replace(" ", "") + ".html"

    #print lyrics_url + "\n"

    try:

        # Open and read the page

        page = urlopen(lyrics_url)
        html = page.read()

        #Find the starting and ending indices for the lyrics

        startindex = html.find("<!-- start of lyrics -->")
        endindex   = html.find("<!-- end of lyrics -->")

        # Slicing to get the lyrics

        lyrics = html[startindex:endindex]

        # Soupifying the page for better display

        soup = BeautifulSoup(lyrics)
        print "\nHere is the lyrics for " + song.upper() + " by " + band.upper() + "\n"
        print soup.get_text()

    except:

        # Printing error message
        print "\nSorry " + song.upper() + " by " + band.upper() + " NOT FOUND\n"
    def remove_tags(p, p2):
        # print "remove_tags p++++++++++++++++", repr(p), "remove_tags p2###################", repr(p2)
        # p=str(p)

        soup = BeautifulSoup(p2, 'html.parser')
        print "++++++++++++++++ soup removed tags: ", soup.get_text()
        return soup.get_text()
Example #16
def getajax(url):
    if not pattern.match(url):
        url = 'http://' + url
    try:
        browser.get(url)
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
    except TimeoutException:
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
        n = 'TIMEDOUT'+n
    except WebDriverException as error:
        if 'MALFORMED_URI' in error.msg:
            n = 'MALFORMED_URI'
        else:
            raise error
    except Exception, error:
        raise error
Example #17
def experience():
    # Reading the Data

    train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    print("train.shape: ", train.shape)
    print("train.columns.values: ", train.columns.values)
    print("print(train['sentiment'][0]): ", train["sentiment"][0])
    print("\nprint(train['review'][0]): ", train["review"][0])

    # Data Cleaning and Text Preprocessing

    # Initialize the BeautifulSoup object on a single movie review     
    example1 = BeautifulSoup(train["review"][0], "lxml")
    print("\nBeautifulSoup(train['review'][0]): ", example1.get_text())

    # Use regular expressions to do a find-and-replace
    letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                        " ",                   # The pattern to replace it with
                        example1.get_text() )  # The text to search
    print("\nletters_only: ", letters_only)

    lower_case = letters_only.lower()        # Convert to lower case
    words = lower_case.split()               # Split into words
    print("found {0} words".format(len(words)))

    # removed, because size of nltk data (>3.7GB)
    # import nltk
    # # nltk.download()  # Download text data sets, including stop words
    # from nltk.corpus import stopwords # Import the stop word list
    # print("stopwords.words: ", stopwords.words("english"))

    stopwords =  ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
    # Remove stop words from "words"
    words = [w for w in words if not w in stopwords]
    print("\n all words:\n", words)    
Example #18
def store_feed(e):
    """
    store a single entry from the feedparser
    :param e: the entry
    :return: if succeed the stored key else None
    """
    query = WebResource.query().filter(WebResource.url == e["link"])
    if query.count() == 0:
        print "STORING: " + e["link"]
        try:
            if 'summary' in e:
                s, t = BeautifulSoup(e['summary'], "lxml"), BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = s.get_text(), t.get_text()
            else:
                t = BeautifulSoup(e['title'], "lxml")
                e['summary'], e['title'] = None , t.get_text()
            k = WebResource.store_feed(e)
            print "STORED: " + str(k)
            return k
        except Exception as e:
            print "Cannot Store: " + str(e)
            return None
    else:
        print "Resource already stored"
        return None
Example #19
    def getTaggedBlog(self, tag):
        ''' Return the tagged blogs's captions or post.'''

        tagged_uri = "http://api.tumblr.com/v2/tagged?tag=" + tag + "&api_key=" + \
            self.consumer_key + "&limit=20"
        req = requests.get(tagged_uri)
        jsonlist = json.loads(req.content)
        body = jsonlist['response']

        tagtext = []

        for blog in body:
            #print "####"
            for data in blog:
                #post
                if data == "body":
                    if blog[data]:
                        #print blog[data]
                        soup = BeautifulSoup(blog[data])
                        text = soup.get_text()
                        tagtext.append(text)
                #an image
                if data == "caption":
                    if blog[data]:
                        #print blog[data]
                        soup = BeautifulSoup(blog[data])
                        text = soup.get_text()
                        tagtext.append(text)

        return tagtext
Example #20
class GenericUrl:

    def __init__(self, url):
        self.url = url

        req = requests.get(url)
        print req.status_code
        print req.headers
        print req.encoding
        html_doc = req.text
        self.soup = BeautifulSoup(html_doc, 'html.parser')

    def title(self):
        print '*** Title:', self.soup.title

    def hrefs(self):
        print '*** hrefs:'
        for link in self.soup.find_all('a'):
            print ' ', link.get('href')

    def text(self):
        print '*** Text:'
        print self.soup.get_text()   # print page text

    def kill_scripts(self):
        # remove script elements
        [s.extract() for s in self.soup('script')]
Example #21
def indexpage_off(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    soup.get_text()
    es = Elasticsearch()
    es.index(index="bc", doc_type='webpage', body={"timestamp": datetime.now(),"text":soup.get_text(),"url":url})
    return True
Example #22
    def build_data(self, page):

        if getattr(page, 'status', 'published') != 'published':
            return

        soup_title = BeautifulSoup(page.title.replace('&nbsp;', ' '))
        page_title = soup_title.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('^', '&#94;')

        soup_text = BeautifulSoup(page.content)
        page_text = soup_text.get_text(' ', strip=True).replace('“', '"').replace('”', '"').replace('’', "'").replace('¶', ' ').replace('^', '&#94;')
        page_text = ' '.join(page_text.split())

        if getattr(page, 'category', 'None') == 'None':
            page_category = ''
        else:
            page_category = page.category.name

        page_url = self.siteurl + '/' + page.url

        page_time = getattr(page, 'date', datetime(1970, 1, 1, 1, 0)).strftime('%s')

        # There may be possible collisions, but it's the best I can think of.
        page_index = abs(zlib.crc32(page_time + page_url))

        return {'title':  page_title,
                'author': page.author,
                'tags': page_category,
                'url': page_url,
                'content': page_text,
                'slug': page.slug,
                'time': page_time,
                'index': page_index,
                'summary': page.summary}
Example #23
    def create_json_node(self, page):

        if getattr(page, "status", "published") != "published":
            return

        soup_title = BeautifulSoup(page.title.replace("&nbsp;", " "), "html.parser")
        page_title = (
            soup_title.get_text(" ", strip=True)
            .replace("“", '"')
            .replace("”", '"')
            .replace("’", "'")
            .replace("^", "&#94;")
        )

        soup_text = BeautifulSoup(page.content, "html.parser")
        page_text = (
            soup_text.get_text(" ", strip=True)
            .replace("“", '"')
            .replace("”", '"')
            .replace("’", "'")
            .replace("¶", " ")
            .replace("^", "&#94;")
        )
        page_text = " ".join(page_text.split())

        if getattr(page, "category", "None") == "None":
            page_category = ""
        else:
            page_category = page.category.name

        page_url = self.siteurl + "/" + page.url

        node = {"title": page_title, "text": page_text, "tags": page_category, "url": page_url}

        self.json_nodes.append(node)
Example #24
def example(request):
    url="https://es.wikipedia.org/wiki/Parten%C3%B3n"

    req = urllib.request.Request(url)
    print("check ")
    response = urllib.request.urlopen(req)
    print("check ")
    the_page = response.read()


    #webread=urlopen('/acilveti92.pythonanywhere.com/hello/')
    print("problem ")

    soup = BeautifulSoup(the_page)

    texts = soup.get_text()
    text = soup.get_text()
    splittext=texts.split()
    simpletext=set(splittext)   #deletes after seeing words reduction
    print("comparation")
    print(len(splittext))
    print(len(simpletext))

    print("extract now")

    for elem in soup.find_all(['script', 'style']):
        elem.extract()

    print("empieza el texto")

    texts = soup.get_text()
    print("acaba el texto")

    splittext=texts.split()

    session_key = request.session._session_key
    #print("the session key is")
    #print(request.session._session_key)

    session = Session.objects.get(session_key=session_key)
    uid = session.get_decoded().get('_auth_user_id')
    click_user = User.objects.get(pk=uid)
    #print("the user is")
    #print(click_user)

    print(len(splittext))
    print("here comes the boom")
    print(splittext[0])
#now a duplicate-avoidance step should be added here to make the replacement faster

    #first filter out non-alphanumeric characters
    simpletext=set(splittext)   #deletes duplicate items
    print("comparation")
    print(len(splittext))
    print(len(simpletext))
    WordNumber=len(simpletext)

    return HttpResponse(text)
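Example #25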
    def parsePost(self,response):
        logging.info(response)
        sel = Selector(response)
        posts = sel.xpath('//div[contains(@class, "exchange_thread_reply_rdr")]')
        items = []
        if len(sel.xpath('//div[contains(@class, "first_item_title_fmt")]'))==0:
            return items
        topic = sel.xpath('//div[contains(@class, "first_item_title_fmt")]/text()').extract()[0]
        url = response.url
        condition="hiv"
        post = sel.xpath('//div[contains(@class, "firstitem_mid_fmt")]')
        item = PostItemsList()
        if len(post.css('.post_hdr_fmt').xpath('./a'))>0:
            item['author'] = post.css('.post_hdr_fmt').xpath("./a").xpath("text()").extract()[0].strip()
            item['author_link']=response.urljoin(post.css('.post_hdr_fmt').xpath("./a/@href").extract()[0])
        else:
            item['author'] = ""
            item['author_link']=""
        date = post.css('.first_posted_fmt').extract()[0]
        date = date[date.find('DateDelta')+11:date.rfind("'")]
        item['condition'] = condition
        item['create_date'] = date
        post_msg=post.css('.post_fmt').extract()[0]
        soup = BeautifulSoup(post_msg, 'html.parser')
        post_msg = re.sub(" +|\n|\r|\t|\0|\x0b|\xa0",' ',soup.get_text()).strip()
        item['post']=post_msg
        item['tag']=''
        item['topic'] = topic
        item['url']=url
        logging.info(post_msg)
        items.append(item)

        for post in posts:
            item = PostItemsList()
            if len(post.css('.post_hdr_fmt'))==0:
                continue
            if len(post.css('.post_hdr_fmt').xpath('./a'))>0:
                item['author'] = post.css('.post_hdr_fmt').xpath("./a").xpath("text()").extract()[0].strip()
                item['author_link']=response.urljoin(post.css('.post_hdr_fmt').xpath("./a/@href").extract()[0])
            else:
                item['author'] = ""
                item['author_link']=""
            date = post.css('.posted_fmt').extract()[0]
            date = date[date.find('DateDelta')+11:date.rfind("'")]
            item['condition'] = condition
            item['create_date'] = date
            post_msg=post.css('.post_fmt').extract()[0]
            soup = BeautifulSoup(post_msg, 'html.parser')
            post_msg = re.sub(" +|\n|\r|\t|\0|\x0b|\xa0",' ',soup.get_text()).strip()
            item['post']=post_msg
            item['tag']='hiv'
            item['topic'] = topic
            item['url']=url
            logging.info(post_msg)
            items.append(item)
        return items
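Example #26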
def scrape_supost(keywords, days_to_check, logfile_path,
                  previous_logfile_contents):
    """Scrapes supost.com to find all posts which contains the given keywords.

    Args:
        keywords (list of str): keywords to search for
        days_to_check (int): number of days back to search
        logfile_path (str): path of log file
        previous_logfile_contents (str): contents of the old log file

    Returns:
        list of str, int: list of new matches, posts searched
    """
    oldest_date = (datetime.date.today() -
                   datetime.timedelta(days=days_to_check))
    oldest_date_str = oldest_date.strftime("%a, %b %d")

    offset = 0
    new_matches = []
    h = httplib2.Http(".cache")
    link = "http://supost.com/search/index/5"

    is_scraping = True
    while is_scraping:
        response, content = h.request(link)
        link_page = BeautifulSoup(content)
        for link in link_page.find_all("a"):
            if ("post/index" in str(link.get("href"))):
                response, content = h.request("http://supost.com" +
                                              str(link.get("href")))
                post_page = BeautifulSoup(content)
                for keyword in keywords:
                    if keyword in str(post_page.get_text()).lower():
                        post_title = post_page.find("h2",
                                                    {"id": "posttitle"}).text
                        output_string = (post_title + ": supost.com" +
                                         link.get("href"))
                        if output_string in previous_logfile_contents:
                            return new_matches, offset
                        else:
                            new_matches.append(output_string)

        # stops scraper when oldest date is found
        if (oldest_date_str in str(link_page.get_text())):
            return new_matches, offset

        # makes sure scraper doesn't go too far
        if (offset + OFFSET_INCREASE > (OFFSET_INCREASE*2*days_to_check)):
            return new_matches, offset

        offset = offset + OFFSET_INCREASE

        # updates the link with the new offset
        link = "http://supost.com/search/index/5?offset=" + str(offset)

    return new_matches, offset
Example #27
	def edit_distance(self):
		import Levenshtein

		srcSoup = BeautifulSoup(self.source_text(), 'html5lib')
		src_content_text = srcSoup.get_text().replace('\n', '').replace('\r', '').replace('   ', '').replace('  ', '').replace(u' ', '')

		dstSoup = BeautifulSoup(self.finish, 'html5lib')
		dst_content_text = dstSoup.get_text().replace('\n', '').replace('\r', '').replace('   ', '').replace('  ', '').replace(u' ', '')

		return Levenshtein.distance(src_content_text, dst_content_text)
Example #28
def edit_distance(src, dst, encoding='utf-8'):
	with codecs.open(src, 'r', encoding=encoding) as srcFile:
		src_content = srcFile.read()
	src_content = '<p>' +src_content.replace('\r\n', '</p>\r\n<p>') +'</p>'
	srcSoup = BeautifulSoup(src_content, 'html5lib')
	src_content_text = srcSoup.get_text().replace('\n', '').replace('\r', '').replace('   ', '').replace('  ', '').replace(u' ', '')
	with codecs.open(dst, 'r', encoding=encoding) as dstFile:
		dst_content = dstFile.read()
	dstSoup = BeautifulSoup(dst_content, 'html5lib')
	dst_content_text = dstSoup.get_text().replace('\n', '').replace('\r', '').replace('   ', '').replace('  ', '').replace(u' ', '')
	return Levenshtein.distance(src_content_text, dst_content_text)
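
A small call sketch for the function above, assuming codecs, BeautifulSoup (with html5lib) and python-Levenshtein are importable as the function requires; the file names are hypothetical:

import codecs

with codecs.open('a.txt', 'w', encoding='utf-8') as f:   # hypothetical source file
    f.write(u'line one\r\nline two')
with codecs.open('b.txt', 'w', encoding='utf-8') as f:   # hypothetical target file
    f.write(u'line one\r\nline twX')
print(edit_distance('a.txt', 'b.txt'))  # 1: the texts differ by one character once markup and line breaks are stripped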
Example #29
def printClasses():
   class_dict = getClasses()
   for class_id in class_dict:
      print 'Assignments for %s: \n\n' % class_dict[class_id]
      URL = 'https://canvas.instructure.com/api/v1/courses/%d/assignments' % class_id
      assignments = requests.get(URL, headers={'Authorization': 'Bearer %s' % TOKEN}).json()
      for arr in assignments:
         if (arr['description'] != ""):
            soup = BeautifulSoup(arr['description'], 'html.parser')
            dateReg = '(([0-9]{4})-([0-9]{2})-([0-9]{2}).*)'
            date = re.search(dateReg, arr['due_at'])
            print soup.get_text() + '- Due at: ' + '%s-%s-%s' % (date.group(3), date.group(4), date.group(2)) + '\n\n\n'
Example #30
def get_transcript_soup(slug):
	"""Returns the html elements based on given slug."""
	url = 'http://www.ted.com/talks/' + slug + "/transcript?language=en"

	content = urllib2.urlopen(url)
	soup = BeautifulSoup(content, "html.parser")
	
	#CHECK ME! the two calls below discard their return values, so they have no effect here
	soup.prettify() #would turn the BS parse tree into a nicely formatted Unicode string
	soup.get_text() #would return only the text within elements; can probably be removed
	
	return soup
Example #31
 def _read_source(imap_host, imap_port, imap_user, imap_pass, imap_folder,
                  email_inreply):
     source = {'alreadyLoaded': False}
     try:  ## Time to search for the original email
         try:
             if "gmail" in imap_host:  # gmail server requires an ssl connection
                 print("gmail server")
                 imap = IMAP4_SSL(imap_host, imap_port)
             else:  # tls is preferred
                 imap = IMAP4(imap_host, imap_port)
                 imap.starttls()
             ## login to server
             #print(imap_user, imap_pass)
             imap.login(imap_user, imap_pass)
         except:
             print("Failed to login")
             return False
         if "gmail" in imap_host:
             imap.select('"[Gmail]/Sent Mail"')  # connect to sent mail.
             #print("Opening gmail 'Sent'")
         else:
             imap.select('Sent')  # connect to sent mail.
             #print("Opening 'Sent'")
         # Search for the original email ID
         messages = imap.search(None, 'HEADER', 'MESSAGE-ID', email_inreply)
         # Process the result to get the message id’s
         messages = messages[1][0].split()
         # Use the first id to view the headers for a message
         result, source_data = imap.fetch(messages[0], '(RFC822)')
         raw_source = source_data[0][
             1]  # here's the body, which is raw headers and html and body of the whole email
         s = email.message_from_bytes(
             raw_source)  # convert to message object
         source_subject = s['subject']
         source['date'] = s['Date']
         source['bcc'] = s['bcc']  #.split(',')
         source['msg_id'] = s['Message-ID']
         #print("BCC from source: ", source_bcc)
         source_body = s.get_payload()
         if s.is_multipart():  # search for text in the body
             for part in s.walk():
                 ctype = part.get_content_type()
                 cdispo = str(part.get('Content-Disposition'))
                 if ctype == ('text/plain' or
                              'text/html') and 'attachment' not in cdispo:
                     source_body = part.get_payload()
                     #print(email_body)
                     break
         #print(frm, " Sent a reply to: ", source_subject)
         self.get_parent().msgIsReply = True
         src_sub = BeautifulSoup(source_subject, 'html.parser')
         try:  # extra check for encryption (in case user has encrypted email)
             src_body = BeautifulSoup(source_body, 'html.parser')
         except:  # if email is encrypted it will throw an exception
             src_body = encription_warning
         source['subject'] = src_sub.get_text()
         source['body'] = src_body.get_text()
         return source
     except:
         print("no origin found")
         return False
Example #32
def soupify(webtext):
    soup = BeautifulSoup(webtext)
    scraped = soup.get_text()
    return scraped.encode('utf-8')
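
A quick illustrative call (assumes bs4 is imported); the function returns the page text as UTF-8 encoded bytes:

text_bytes = soupify('<p>café &amp; bar</p>')
print(text_bytes)  # UTF-8 encoded bytes of 'café & bar'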
Example #33
 def _read_mail(imap_host, imap_port, imap_user, imap_pass, imap_folder,
                eNum):  # reads the most recent email and parses the text
     ### Reading emails from the server. The bulk of the logic is here
     ### We process an email, clean up the text, and check if it is a reply
     ### If the message is a reply, search for the original email in the sent box
     ### If the original email exists, run a search on the inbox for all emails replying to the original
     ### And finally, check for and load images
     global ids_list
     if eNum == -1:
         ids_list = []
     email_recieved = {'alreadyLoaded': False}
     try:
         if "gmail" in imap_host:  # gmail server requires an ssl connection
             print("gmail server")
             imap = IMAP4_SSL(imap_host, imap_port)
         else:  # tls is preferred
             imap = IMAP4(imap_host, imap_port)
             imap.starttls()
         ## login to server
         print(imap_user, imap_pass)
         imap.login(imap_user, imap_pass)
     except:
         print("Failed to login")
         return False
     #print(imap.list()) # for identifying mailboxes on the server
     imap.select("Inbox")  # connect to all mail.
     result, data = imap.uid('search', None,
                             "ALL")  # search and return uids instead
     ids = data[0]  # data is a list.
     id_list = ids.split()  # ids is a space separated string
     current_email_uid = data[0].split()[eNum]
     #print(current_email_uid)
     result, data = imap.uid(
         'fetch', current_email_uid, '(RFC822)'
     )  # fetch the email headers and body (RFC822) for the given ID
     raw_email = data[0][
         1]  # here's the body, which is raw headers and html and body of the whole email
     b = email.message_from_bytes(raw_email)
     email_recieved['msg_id'] = b['Message-ID']
     #print("printing id", msg_id, ids_list)
     for i in ids_list:
         if i == email_recieved['msg_id']:
             print("mail already loaded")
             email_recieved['alreadyLoaded'] = True
             #self.get_parent()._already_loaded()
             return
     ids_list.append(email_recieved['msg_id'])
     email_from = b['from']
     email_subject = b['subject']
     email_recieved['date'] = b['Date']
     email_recieved['inreply'] = b['in-reply-to']
     email_recieved['refs'] = b['references']
     email_body = b.get_payload()
     if b.is_multipart():  # search for text in the body
         for part in b.walk():
             ctype = part.get_content_type()
             cdispo = str(part.get('Content-Disposition'))
             if ctype == ('text/plain'
                          or 'text/html') and 'attachment' not in cdispo:
                 email_body = part.get_payload()
                 #print(email_body)
                 break
     # Use beautifulsoup to get readable text
     frm = BeautifulSoup(email_from, 'html.parser')
     sub = BeautifulSoup(email_subject, 'html.parser')
     try:  # Try parsing the body text
         body = BeautifulSoup(email_body, 'html.parser')
     except:  # if email is encrypted it will throw an exception
         body = encription_warning
         email_recieved['body'] = encription_warning
     email_recieved['from'] = frm.get_text()
     email_recieved['subject'] = sub.get_text()
     #find just the email address
     add1 = re.findall("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)",
                       str(frm))
     email_recieved['email'] = add1[0]
     if body != encription_warning:
         email_recieved['body'] = body.get_text()
     return email_recieved
     '''
Example #34
 def find_replies(imap_host, imap_port, imap_user, imap_pass, imap_folder,
                  email_inreply):
     try:
         # On to find more emails that may be replies
         replies_list = []
         try:
             if "gmail" in imap_host:  # gmail server requires an ssl connection
                 print("gmail server")
                 imap = IMAP4_SSL(imap_host, imap_port)
             else:  # tls is preferred
                 imap = IMAP4(imap_host, imap_port)
                 imap.starttls()
             ## login to server
             #print(imap_user, imap_pass)
             imap.login(imap_user, imap_pass)
         except:
             print("Failed to login")
             return False
         imap.select("Inbox")
         replies = imap.search(None, 'HEADER', 'IN-REPLY-TO', email_inreply)
         # BODY.PEEK[HEADER.FIELDS (SUBJECT)]
         print("searched inbox for ", email_inreply)
         # Process the result to get the message id’s
         replies = replies[1][0].split()
         print("got list of replies")
         # Use the first id to view the headers for a message
         replies.reverse()
         for i in replies:
             reply = {}
             print("Checking list of replies")
             result, reply_data = imap.fetch(i, '(RFC822)')
             print("loaded a reply")
             raw_reply = reply_data[0][
                 1]  # here's the body, which is raw headers and html and body of the whole email
             #print("raw reply")
             r = email.message_from_bytes(
                 raw_reply)  # convert to message object
             #reply_to = r['in-reply-to']
             reply['refs'] = r['references']
             print("references", reply['refs'])
             reply_from = r['from']
             reply_email = re.findall(
                 "([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)",
                 str(reply_from))
             reply['date'] = r['Date']
             reply['msg_id'] = r['Message-ID']
             reply_body = r.get_payload()
             if r.is_multipart():  # search for text in the body
                 for part in r.walk():
                     ctype = part.get_content_type()
                     cdispo = str(part.get('Content-Disposition'))
                     if ctype == ('text/plain' or 'text/html'
                                  ) and 'attachment' not in cdispo:
                         reply_body = part.get_payload()
                         #print(email_body)
                         break
             rep_from = BeautifulSoup(reply_from, 'html.parser')
             reply['email'] = reply_email[0]
             reply['from'] = rep_from.get_text()
             try:  # extra check for encryption (in case user has encrypted email)
                 rep_body = BeautifulSoup(reply_body, 'html.parser')
                 reply['body'] = rep_body.get_text()
             except:  # if email is encrypted it will throw an exception
                 reply['body'] = encription_warning
             #print("Hello! I am found, ")
             replies_list.append(reply)
         return replies_list
     except:
         print("No more replies found.")
         return False
Example #35
from googlesearch import search
from bs4 import BeautifulSoup
import requests
import urllib.request

for i,j in workdf.iterrows():
    if(len(workdf['Company'][i]))<=5:
        str_list = list('https://www.ifsccodebank.com/search-by-IFSC-code.aspx?IFSCCode=')
        link2 = workdf['Company'][i]
        print(link2,type(link2))
        str_list.append(link2)
        url = ''.join(str_list)
        print(url)
        response = requests.get(''.join(str_list))
        soup = BeautifulSoup(response.text,'html.parser')
        t = soup.get_text()
        r = t[t.find(link2)+7:]
        workdf['Title'][i] = (r[:r.find('-')-5].strip())
        for j in search(str(workdf['Title'][i]),stop = 1):
            workdf['Link'][i] = j
    else:
        workdf['Title'][i] = 'err'

workdf

"""### FOR non-acronym companies --- OBTAINING TITLE from LINK(column - WEBPAGE) TITLE"""

count = 0
for i,j in workdf.iterrows():
    if workdf['Title'][i]=='err' or workdf['Title'][i].startswith('ifsccode') or workdf['Title'][i]=='0':
        url = workdf['Link'][i]
Example #36
def scrape(url):
    resp = requests.get(url)
    text = ''  # fall back to an empty string if the request fails
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        text = soup.get_text()
    return text
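
A hypothetical call, assuming requests and bs4 are imported; example.com stands in for any reachable URL:

page_text = scrape('https://example.com')  # placeholder URL
print(page_text[:200])  # first 200 characters of the page's visible text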
Example #37
def Tell_Me_Alfred(query="The Himalayas are", answer_type="Description"):

    global ALL_RESULTS
    global ALL_ANSWERS_SORTED
    global ALL_ANSWERS
    ALL_RESULTS = []
    ALL_ANSWERS = dict()

    for url in search(query, stop=20):
        try:
            #print(url)
            ALL_RESULTS.append(url)
        except:
            print("URL Error")

    #ALL_RESULTS=['http://www.victoriamemorial-cal.org/', 'http://www.tripadvisor.in/Attraction_Review-g304558-d311680-Reviews-Victoria_Memorial_Hall-Kolkata_Calcutta_West_Bengal.html', 'http://kolkata.cityseekr.com/venue/403224-victoria-memorial', 'http://www.thecityguide.in/Kolkata/Art-Entertainment/SGGG/Victoria-Memorial-Elgin', 'http://www.justdial.com/Kolkata/Victoria-Memorial-Hall/033P6853927_BZDET', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#History', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Finance', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Design', 'http://en.wikipedia.org/wiki/Victoria_Memorial_(India)#Construction', 'http://ww.itimes.com/poll/best-image-of-victoria-memorial-kolkata-54ad5bd294fa2', 'http://www.trekearth.com/gallery/Asia/India/East/West_Bengal/Kolkata_(Calcutta)/photo1412050.htm', 'http://www.culturalindia.net/monuments/victoria-memorial.html', 'http://knowindia.gov.in/knowindia/culture_heritage.php?id=68', 'http://www.youtube.com/watch?v=C_0IvslcRqU', 'http://www.ixigo.com/victoria-memorial-kolkata-india-ne-1019165', 'http://www.lonelyplanet.com/india/kolkata-calcutta/sights/architecture/victoria-memorial', 'http://www.indianholiday.com/tourist-attraction/kolkata/victoria-memorial.html', 'http://www.mapsofindia.com/kolkata/places-of-interest/famous-monuments/victoria-memorial.html', 'https://www.facebook.com/pages/Victoria-Memorial-Hall-Kolkata/113100222172879', 'http://www.iloveindia.com/indian-monuments/victoria-memorial.html', 'http://www.kolkata.org.uk/tourist-attractions/victoria-memorial.html', 'http://www.vmsb.org/contact_us.html', 'http://mocomi.com/victoria-memorial-facts/', 'http://www.journeymart.com/de/india/west-bengal/kolkata/victoria-memorial.aspx', 'http://www.theincredibleindiatravel.com/victoria-memorial-hall-india/victoria-memorial.html', 'http://goindia.about.com/od/cities/ig/Kolkata-Photo-Gallery/Victoria-Memorial.htm', 'http://zeenews.india.com/news/sci-tech/victoria-memorial-museum-blackout-in-kolkata-for-earth-hour_1569445.html']
    #ALL_RESULTS=['http://en.wikipedia.org/wiki/Himalayas', 'http://en.wikipedia.org/wiki/Paro_Taktsang', 'http://en.wikipedia.org/wiki/List_of_Himalayan_peaks_and_passes', 'http://en.wikipedia.org/wiki/Indian_Himalayan_Region', 'http://en.wikipedia.org/wiki/Indian_Plate', 'http://simple.wikipedia.org/wiki/Himalayas', 'http://www.thehindu.com/sci-tech/energy-and-environment/emissions-from-biomass-burning-cross-the-himalayas/article7105899.ece', 'http://www.npr.org/blogs/goatsandsoda/2015/04/15/399579066/in-search-of-the-missing-trekkers-in-nepal-s-muddy-morass', 'http://www.nzherald.co.nz/bay-of-plenty-times/news/article.cfm?c_id=1503343&objectid=11434737', 'http://www.youtube.com/watch?v=HuSHOQ6gv5Y', 'http://www.britannica.com/EBchecked/topic/266037/Himalayas', 'http://www.english-online.at/geography/himalayas/himalaya-mountain-range.html', 'http://www.himalayanfootsteps.com/destinations/where-are-the-himalayas/', 'http://www.mountainprofessor.com/the-himalaya.html', 'http://www.himalaya2000.com/himalayan-facts/location-of-himalayas.html', 'http://www.unmuseum.org/yeti.htm', 'http://www.hitt-initiative.org/mla/?page_id=390', 'http://www.robinsonlibrary.com/geography/physical/mountains/himalaya.htm', 'http://geography.howstuffworks.com/asia/the-himalayas.htm', 'http://www.kidsdiscover.com/spotlight/himalayas-kids/', 'http://pubs.usgs.gov/gip/dynamic/himalaya.html', 'http://www.todayifoundout.com/index.php/2013/12/himalayas-formed/', 'http://www.pbs.org/wgbh/nova/everest/earth/birth.html', 'http://www.pbs.org/wnet/nature/the-himalayas-himalayas-facts/6341/', 'http://www.pbs.org/wnet/nature/the-himalayas-introduction/6338/', 'http://www.oddizzi.com/teachers/explore-the-world/physical-features/mountains/mountain-case-study/himalayas/', 'https://vimeo.com/121045965', 'http://www.worldwildlife.org/places/eastern-himalayas', 'http://www.answers.com/Q/What_are_the_Himalayas']

    print('YOUR TOP ANSWERS ARE:')
    c = 0.0
    for res in ALL_RESULTS:
        Exact_Match_Found_flag = 0
        try:
            timeout = 0
            #print 'Checking Source:',res
            response = urllib.request.urlopen(res)
            page_data = response.read()
            page_data = BeautifulSoup(page_data)
            page_data = page_data.get_text()
            page_data = page_data.split('.')

            # Read from Individual Web Pages
            if answer_type == "Description":
                Start_T = time.time()
                for line in page_data:
                    Curr_T = time.time()
                    if (Curr_T - Start_T) > 15.0:
                        break
                    if re.findall(query.lower(), line.lower()) != []:
                        c += 1.0
                        line_low = line.lower()
                        line = line_low.split(query.lower())
                        print(
                            '==============================================================================='
                        )
                        print('Answer ', c, ':')
                        line = query + line[1] + '.'
                        print(line)
                        print('\n\nSource: ', res)
                        print(
                            '==============================================================================='
                        )
                        Exact_Match_Found_flag = 1
                        break

            elif answer_type == "Location":
                query_parts = query.split(' ')
                Start_T = time.time()
                for line in page_data:
                    Curr_T = time.time()
                    if (Curr_T - Start_T) > 30.0:
                        break
                    check_next = 0
                    for each_qp in query_parts:
                        if re.findall(each_qp.lower(), line.lower()) == []:
                            check_next = 1
                            break
                    if check_next == 1:
                        continue
                    else:
                        line_parts = line.split(' ')
                        for each_lp in line_parts:
                            if (each_lp in query_parts) or (
                                    each_lp
                                    in IGNORE_LIST):  #Skip the Query Words
                                continue
                            if check_WordNet(
                                    word=each_lp,
                                    def_word='city') or check_WordNet(
                                        word=each_lp,
                                        def_word='country') or check_WordNet(
                                            word=each_lp, def_word='continent'
                                        ) or check_WordNet(word=each_lp,
                                                           def_word='state'):
                                c += 1.0
                                print(each_lp)
                                if each_lp not in ALL_ANSWERS:
                                    ALL_ANSWERS[each_lp] = 1
                                else:
                                    ALL_ANSWERS[each_lp] += 1
                                Exact_Match_Found_flag = 1
                                break
                        if Exact_Match_Found_flag:
                            break

            #print 'Finished Checking Source:',res
        except:
            print()

    #Give a Probability for One Word Answers
    if answer_type == "Location":

        ALL_ANSWERS_SORTED = []
        all_ans = list(ALL_ANSWERS.keys())
        for each_ans in all_ans:
            ALL_ANSWERS_SORTED.append([ALL_ANSWERS[each_ans], each_ans])

        ALL_ANSWERS_SORTED.sort()
        print(
            '==============================================================================='
        )
        print('SUMMARY:')
        print(
            '---------------------------------------------------------------------------'
        )
        for each_sa in range(0, len(ALL_ANSWERS_SORTED)):
            idx = len(ALL_ANSWERS_SORTED) - 1 - each_sa
            print(ALL_ANSWERS_SORTED[idx][1])
            print('Confidence Measure= ',
                  (ALL_ANSWERS_SORTED[idx][0] / c * 100.0), '%')
            print(
                '---------------------------------------------------------------------------'
            )
        print(
            '==============================================================================='
        )
Example #38
def ProcessPage(options, mycursor, languages, mtProc, statusCode,
                orig_encoding, htmlText, pageURL, crawlDate, languagesClass):
    print("page", pageURL)
    if pageURL == "unknown":
        logging.info("Unknown page url")
        return

    if orig_encoding == None:
        logging.info("Encoding of document " + pageURL +
                     " could not be identified")

    if len(htmlText) == 0:
        logging.info("Empty page")
        return

    # lang id
    # printable_str = ''.join(x for x in cleantree if x in string.printable)
    logging.info(pageURL + ": detecting language")
    success, lang = guess_lang_from_data2(htmlText)
    if success:
        langId = languagesClass.GetOrSaveLang(lang)
    else:
        return

    logging.info(pageURL + ": Getting text with BeautifulSoup")
    soup = BeautifulSoup(htmlText, features='html5lib')  # lxml html.parser
    for script in soup(["script", "style", "img"]):
        script.extract()  # rip it out

    plaintext = soup.get_text()

    if len(plaintext) > 0:
        # Guessing MIME of the file (checked on original content)
        logging.info(pageURL + ": Getting mime")
        mime = magic.from_buffer(htmlText, mime=True)
        # mimeFile.write(mime.encode() + b"\n")

        c = hashlib.md5()
        c.update(htmlText.encode())
        hashDoc = c.hexdigest()

        pageURLId = SaveURL(mycursor, pageURL)
        docId = SaveDoc(mycursor, crawlDate, statusCode, pageURLId, langId,
                        mime, hashDoc)
        # print("docId", docId)

        # links
        SaveLinks(mycursor, languages, mtProc, soup, pageURL, docId,
                  languagesClass)

        # write html and text files
        filePrefix = options.outDir + "/" + str(docId)

        with lzma.open(filePrefix + ".html.xz", "wt") as htmlFile:
            htmlFile.write(htmlText)
        with lzma.open(filePrefix + ".text.xz", "wt") as textFile:
            textFile.write(plaintext)

        # print("plaintext", len(plaintext))
        splitterCmd = "{bitextorRoot}/preprocess/moses/ems/support/split-sentences.perl -b -l {lang1}".format(
            bitextorRoot=bitextorRoot, lang1=lang)
        extractedLines = split_sentences(plaintext, splitterCmd,
                                         options.prune_type,
                                         options.prune_threshold)

        if os.path.exists(options.outDir):
            if not os.path.isdir(options.outDir):
                sys.stderr.write("Must be a directory: " + options.outDir)
        else:
            os.mkdir(options.outDir)

        # write splitted file
        extractPath = options.outDir + "/" + str(
            docId) + "." + lang + ".extracted.xz"
        with lzma.open(extractPath, 'wt') as extractFile:
            for extractedLine in extractedLines:
                extractFile.write(str(docId) + "\t" + extractedLine + "\n")

        if lang != languages[-1]:
            # translate
            transPath = options.outDir + "/" + str(docId) + ".trans.xz"
            transFile = lzma.open(transPath, 'wt')

            for inLine in extractedLines:
                pass
                # print("inLine", inLine)
                #inLine += "\n"
                #mtProc.stdin.write(inLine.encode('utf-8'))
                #mtProc.stdin.flush()
                #outLine = mtProc.stdout.readline()
                #outLine = outLine.decode("utf-8")
                #transFile.write(str(docId) + "\t" + outLine)

            transFile.close()
Example #39
import requests
from bs4 import BeautifulSoup
import string

WIKI_URL = 'https://ia601405.us.archive.org/18/items/alicesadventures19033gut/19033.txt'
req = requests.get(WIKI_URL)
soup = BeautifulSoup(req.text, 'html5lib')
file = soup.get_text()
testo = ''.join([x for x in file if x in string.ascii_letters + ' ' + '-' ])
word_counts = {}


parole = testo.strip().split(' ')

for value in parole:
    key = value.translate(str.maketrans('','',string.punctuation)).lower()
    if key in word_counts.keys():
        word_counts[key] += 1
    else:
        word_counts[key] = 1

print(word_counts) 
Example #40
    <li><a href="/scholarships-for-veterans" id="scholarship">
    <li><a href="http://www.simplelearn.com/feed/" id="rss">RSS FEED</a></li>


</ul>"""

#<!--Create soup object-->
soup_SL = BeautifulSoup(data_SL, 'html.parser')

#if i do this get all info
print(soup_SL)

#parse only part of the document,text values for tags using getText method

print('_______________________Get only req___________________')
print(soup_SL.get_text())

#import soupstrainer class for parsing the desied part of the web document
from bs4 import SoupStrainer

#create object to parse only the id (link) with lab
tags_with_LabLink = SoupStrainer(id='lab')

#print the part of the parsed document

print(BeautifulSoup(data_SL, 'html.parser', parse_only=tags_with_LabLink))

print('--------------------------')

print(
    BeautifulSoup(data_SL, 'html.parser',
Example #41
r = requests.get(url)

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc)

# Get the title of Guido's webpage: guido_title
guido_title = (soup.title)

# Print the title of Guido's webpage to the shell
print(guido_title)

# Get Guido's text: guido_text
guido_text = soup.get_text()

# Print Guido's text to the shell
print(guido_text)

# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url
url = 'https://www.python.org/~guido/'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extracts the response as html: html_doc
Example #42
 def get_html_node_text(self, html):
     soup = BeautifulSoup(html, "lxml")
     return str(soup.get_text())
Example #43
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
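
A one-line usage example, assuming bs4 is imported:

print(strip_html('<p>Plain <b>text</b> only</p>'))  # Plain text only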