예제 #1
1
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from spynner import Browser
from pyquery import PyQuery
import difflib
import re

# A single module-level browser is created at import time;
# get_info_from_title() below drives it through browser.load().
print '* starting browser...'
browser = Browser()
# Use PyQuery as the browser's HTML parser.
browser.set_html_parser(PyQuery)

class NoMatchException(Exception):
    """Custom exception type with no extra behaviour.

    NOTE(review): the code raising it is not visible here; judging by the
    name it presumably signals that no matching result was found.
    """

def match_score(a, b):
    """Return a case-insensitive similarity score for two strings.

    The score is the mean of two values computed on the lower-cased inputs:

    * ``difflib.SequenceMatcher.ratio()`` (a single space is treated as
      junk by the matcher), and
    * the size of the longest common block divided by the length of the
      shorter string.

    Identical non-empty strings score 1.0 and strings with nothing in
    common score 0.0. Empty input is handled explicitly so that no
    division by zero occurs: two empty strings score 1.0, and an empty
    string against a non-empty one scores 0.0.

    :param a: first string.
    :param b: second string.
    :returns: float score.
    """
    # Lower-case once and measure the lower-cased text; the matcher works on
    # these strings, so the search bounds must be their lengths.
    a_low = a.lower()
    b_low = b.lower()
    shorter = min(len(a_low), len(b_low))

    matcher = difflib.SequenceMatcher(lambda ch: ch == ' ', a_low, b_low)
    ratio = matcher.ratio()
    if shorter == 0:
        # Nothing to divide by. ratio() is already 1.0 for two empty strings
        # and 0.0 when only one side is empty, which is the sensible score.
        return ratio

    longest = matcher.find_longest_match(0, len(a_low), 0, len(b_low)).size
    return (ratio + float(longest) / shorter) / 2

def get_info_from_title(title):
    """Start a Scopus lookup for ``title`` using the module-level browser.

    The visible part prints the title in brackets, creates an empty ``data``
    dict and loads the Scopus home page.

    NOTE(review): this excerpt stops right after the "filling search"
    message, so how ``data`` is filled and what the function returns cannot
    be confirmed from here.
    """
    print '[%s]' % title
    data = {}
    
    # navigate to scopus
    print '* navigating to Scopus...'
    browser.load('http://www.scopus.com/')

    # fill in search info
    print '* filling search...'
예제 #2
0
    def __init__(self, source_lang, target_lang, translate_type='headless'):
        """Remember the language pair and prepare the selected backend.

        ``translate_type`` picks the backend: ``'headless'`` creates a
        spynner ``Browser`` parsed with PyQuery, ``'selenium'`` starts
        Firefox through selenium, and ``'simple'`` creates neither. The
        matching method is bound to ``self.translate``; any other value
        raises ``KeyError`` from the lookup at the end.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        # Both backends start out unset; at most one of them is created.
        self.headless_browser = None
        self.driver = None
        if translate_type == 'selenium':
            self.driver = webdriver.Firefox()
        elif translate_type == 'headless':
            self.headless_browser = Browser()
            self.headless_browser.set_html_parser(PyQuery)

        # Map each backend name to the method that implements it.
        translators = {
            'selenium': self.translate_text_google,
            'headless': self.translate_text_google_headless,
            'simple': self.translate_simple,
        }
        self.translate = translators[translate_type]
예제 #3
0
    def process_request(self, request, spider):
        """Render ``request.url`` in a spynner browser and return the result.

        A fresh ``Browser`` with its own webview is created for every call,
        the URL is loaded, and the rendered HTML is returned UTF-8 encoded
        inside an ``HtmlResponse``. The browser is always closed afterwards
        so that webviews do not pile up across requests.

        :param request: request object; only ``request.url`` is read.
        :param spider: not used by this method.
        :returns: ``HtmlResponse`` for ``request.url`` with the rendered body.
        """
        browser = Browser()
        browser.create_webview()
        try:
            browser.set_html_parser(PyQuery)
            # 20 is load()'s second positional argument -- presumably the
            # load timeout in seconds; confirm against the spynner version.
            browser.load(request.url, 20)

            try:
                browser.wait_load(10)
            except Exception:
                # Best effort: a failed extra wait is reported but whatever
                # has been rendered so far is still returned. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                print('###########ERROR###########')

            body = browser.html.encode('utf-8')
        finally:
            browser.close()

        return HtmlResponse(request.url, body=body)
예제 #4
0
def get_info(id):
    """Load the submissions page for user ``id`` and print its HTML.

    :param id: user id; it is concatenated onto the URL as-is, so it must
        be a string.

    The page is fetched without logging in. An earlier draft (removed here,
    see version control) logged in through Selenium and ``requests`` and
    copied the session cookies into the browser before loading the page.
    """
    browser = Browser()
    try:
        browser.load('http://informatics.mccme.ru/submits/view.php?user_id=' + id)
        browser.wait_load()
        html = browser.html
    finally:
        # Release the browser even when loading fails.
        browser.close()

    print(html)
예제 #5
0
    def process_request(self, request, spider):
        """Render ``request.url`` in a spynner browser and return the result.

        A fresh ``Browser`` with its own webview is created for every call,
        the URL is loaded, and the rendered HTML is returned UTF-8 encoded
        inside an ``HtmlResponse``. The browser is always closed afterwards
        so that webviews do not pile up across requests.

        :param request: request object; only ``request.url`` is read.
        :param spider: not used by this method.
        :returns: ``HtmlResponse`` for ``request.url`` with the rendered body.
        """
        browser = Browser()
        browser.create_webview()
        try:
            browser.set_html_parser(PyQuery)
            # 20 is load()'s second positional argument -- presumably the
            # load timeout in seconds; confirm against the spynner version.
            browser.load(request.url, 20)

            try:
                browser.wait_load(10)
            except Exception:
                # Best effort: a failed extra wait is reported but whatever
                # has been rendered so far is still returned. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                print('###########ERROR###########')

            body = browser.html.encode('utf-8')
        finally:
            browser.close()

        return HtmlResponse(request.url, body=body)
    def __init__(self, source_lang, target_lang, translate_type='headless'):
        """Remember the language pair and prepare the selected backend.

        ``translate_type`` picks the backend: ``'headless'`` creates a
        spynner ``Browser`` parsed with PyQuery, ``'selenium'`` starts
        Firefox through selenium, and ``'simple'`` creates neither. The
        matching method is bound to ``self.translate``; any other value
        raises ``KeyError`` from the lookup at the end.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        # Both backends start out unset; at most one of them is created.
        self.headless_browser = None
        self.driver = None
        if translate_type == 'selenium':
            self.driver = webdriver.Firefox()
        elif translate_type == 'headless':
            self.headless_browser = Browser()
            self.headless_browser.set_html_parser(PyQuery)

        # Map each backend name to the method that implements it.
        translators = {
            'selenium': self.translate_text_google,
            'headless': self.translate_text_google_headless,
            'simple': self.translate_simple,
        }
        self.translate = translators[translate_type]
class FileTranslator(object):
    """Translate text, or the ``text`` field of a JSON file, via Google Translate.

    One of three backends is selected with ``translate_type`` and exposed as
    ``self.translate(text)``:

    * ``'headless'`` -- spynner headless browser
      (``translate_text_google_headless``)
    * ``'selenium'`` -- Firefox driven by selenium (``translate_text_google``)
    * ``'simple'`` -- a plain HTTP request (``translate_simple``)
    """

    def __init__(self, source_lang, target_lang, translate_type='headless'):
        """Store the language pair and set up the chosen backend.

        :param source_lang: source language; must provide
            ``to_google_translate()``.
        :param target_lang: target language; same requirement.
        :param translate_type: ``'headless'``, ``'selenium'`` or ``'simple'``.
        :raises KeyError: if ``translate_type`` is none of the above.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        # Only the browser required by the chosen backend is started; the
        # other attribute stays None ('simple' needs neither).
        self.headless_browser = self.driver = None
        if translate_type == 'headless':
            self.headless_browser = Browser()
            self.headless_browser.set_html_parser(PyQuery)
        elif translate_type == 'selenium':
            self.driver = webdriver.Firefox()

        # Bind the backend so callers can simply call self.translate(text).
        self.translate = {
            'simple': self.translate_simple,
            'headless': self.translate_text_google_headless,
            'selenium': self.translate_text_google,
        }[translate_type]

    def _forcefully_kill_firefox(self):
        """Kill every Firefox process; used when Firefox gets stuck.

        Relies on the Windows ``taskkill`` command.
        """
        print("Killing Firefox forcefully...")
        os.system('taskkill /im firefox.exe /f /t')

    def translate_text_google(self, text_to_translate, quit_browser=True):
        """Translate ``text_to_translate`` with selenium (slow, may hang).

        :param text_to_translate: text placed verbatim into the URL fragment.
        :param quit_browser: currently unused; kept for compatibility.
        :returns: text of the page's ``result_box`` element.

        Whatever ``WebDriverWait.until`` raises when the result does not
        appear in time propagates to the caller.
        """
        # Open Google Translate website
        url = "http://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(),
            text_to_translate)
        self.driver.get(url)

        # Watchdog: if the wait below is still blocked after 11 seconds,
        # Firefox is considered stuck and is killed.
        watchdog = Timer(11.0, self._forcefully_kill_firefox)
        watchdog.start()
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@id='result_box']/span[@class='hps']")))
        finally:
            # Always disarm the watchdog. Otherwise a wait that fails after
            # 10 seconds leaves the timer running and it kills Firefox a
            # second later, possibly in the middle of the next translation.
            watchdog.cancel()

        result = self.driver.find_element_by_id('result_box')
        translated_text = result.text
        # Navigate away so the next call starts from a different page.
        self.driver.get('http://www.google.com')
        return translated_text

    def translate_text_google_headless(self, text_to_translate):
        """Translate ``text_to_translate`` with the spynner headless browser.

        :returns: text content of the first ``#result_box`` element.
        :raises IndexError: if the page has no ``#result_box`` element.
        """
        url = "https://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(),
            text_to_translate)
        self.headless_browser.load(url)
        result_box = self.headless_browser.soup('#result_box')[0]
        translated_text = result_box.text_content()
        # Navigate away so the next call starts from a different page.
        self.headless_browser.load('http://www.example.com/')
        return translated_text

    def translate_simple(self, text_to_translate):
        """Translate ``text_to_translate`` with a plain HTTP request.

        Requests the mobile Google Translate page and returns the text that
        follows the ``class="t0">`` marker, up to the next ``<``.

        It doesn't really work on more than 2000 characters for some reason
        (maybe a limit imposed by Google).

        NOTE(review): only spaces are escaped (as ``+``); other characters
        that are special in a query string (``&``, ``#``, ``%``...) are sent
        as-is.

        :returns: the translated text.
        :raises ValueError: if the response does not contain the marker.
        """
        agents = {'User-Agent': "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)"}
        before_trans = 'class="t0">'
        link = "http://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            self.target_lang.to_google_translate(),
            self.source_lang.to_google_translate(),
            text_to_translate.replace(" ", "+"))
        request = urllib2.Request(link, headers=agents)
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()

        start = page.find(before_trans)
        if start == -1:
            # Without this check find()'s -1 would be used as an offset and
            # an arbitrary piece of the page returned as the "translation".
            raise ValueError(
                "Translation marker %r not found in response" % before_trans)
        result = page[start + len(before_trans):]
        return result.split("<")[0]

    def translate_text_spanishenglish(self, text_to_translate):
        """
            An attempt to use spynner to translate through spanishenglish.com
            (instead of google.. in case google will block us or something..)
            The problem is that for some reason spanishenglish.com doesn't work with the
            spynner core (it does work well with chrome though)... so it's stuck now...

            Currently does nothing and returns None.
        """
        # url = "http://www.spanishenglish.com/"
        # self.headless_browser.load(url)
        # print "LOADED"
        # self.headless_browser.click("a[href='#en']", wait_load=False)
        # self.headless_browser.click("#LangPair_ToDDL tbody tr td a[href='#de']", wait_load=False)
        # self.headless_browser.wk_fill("#InputText", "How are you doing today?")
        # self.headless_browser.click_ajax("#TranslateButton")
        #
        # import time
        # #time.sleep(15)
        # result_box = self.headless_browser.soup('#OutputTextHtmlCell')[0]
        # translated_text = result_box.text_content()
        # return translated_text
        #
        # # self.headless_browser.show()
        # # time.sleep(30)

    def translate_file(self, path):
        """Translate the ``text`` field of the JSON document stored at ``path``.

        :returns: whatever ``self.translate`` returns for that text.
        """
        with open(path) as f:
            raw = f.read()
        # The file is closed before the (slow) translation starts.
        text = ujson.loads(raw)['text']
        return self.translate(text)

    def translate_to_file(self, source_path, target_path):
        """Translate ``source_path`` and write the result to ``target_path``.

        Failures are handled here and never propagate: on any error nothing
        is written. ``target_path`` is written as UTF-8.
        """
        import traceback

        try:
            translated_text = self.translate_file(source_path)
        except httplib.CannotSendRequest:
            # The connection to the driver is unusable (presumably because
            # Firefox was killed); start a new Firefox for later calls and
            # skip this file.
            self.driver = webdriver.Firefox()
            return
        except Exception:
            print("Failed to translate: {:s}".format(source_path))
            print('=' * 60)
            print(traceback.format_exc())
            return

        with codecs.open(target_path, 'w', 'utf-8') as f:
            f.write(translated_text)
예제 #8
0
#
# app = QApplication(sys.argv)
# web = QWebView()
# web.load(QUrl(url))
# f = lambda x: x
# # QObject.connect(web, SIGNAL("loadFinished"), f(4))
#

# import mechanize
# url = "http://www.spanishenglish.com/"
# browser = mechanize.Browser()
# browser.set_handle_robots(False)
# browser.open(url)

# Experiment: drive spanishenglish.com with a spynner browser and read the
# translation of a fixed sentence. Runs top to bottom at import time.
from spynner import Browser
browser = Browser()
from pyquery import PyQuery
# Use PyQuery as the browser's HTML parser.
browser.set_html_parser(PyQuery)
url = "http://www.spanishenglish.com/"
browser.load(url)
print "LOADED"
# Click the two language links (presumably source '#en' and target '#de' --
# confirm against the page), passing wait_load=False to each click.
browser.click("a[href='#en']", wait_load=False)
browser.click("#LangPair_ToDDL tbody tr td a[href='#de']", wait_load=False)
# Type the sentence into the input box and press the translate button.
browser.wk_fill("#InputText", "How are you doing today?")
browser.click_ajax("#TranslateButton")
import time
# Fixed 15-second pause before reading the output element.
time.sleep(15)
# Take the first element matching '#TranslationOutput' and extract its text.
result_box = browser.soup('#TranslationOutput')[0]
# NOTE(review): translated_text is never printed or used in the visible code.
translated_text = result_box.text_content()
# browser.show()
time.sleep(30)
예제 #9
0
# web.load(QUrl(url))
# f = lambda x: x
# # QObject.connect(web, SIGNAL("loadFinished"), f(4))
#



# import mechanize
# url = "http://www.spanishenglish.com/"
# browser = mechanize.Browser()
# browser.set_handle_robots(False)
# browser.open(url)


# Experiment: drive spanishenglish.com with a spynner browser and read the
# translation of a fixed sentence. Runs top to bottom at import time.
from spynner import Browser
browser = Browser()
from pyquery import PyQuery
# Use PyQuery as the browser's HTML parser.
browser.set_html_parser(PyQuery)
url = "http://www.spanishenglish.com/"
browser.load(url)
print "LOADED"
# Click the two language links (presumably source '#en' and target '#de' --
# confirm against the page), passing wait_load=False to each click.
browser.click("a[href='#en']", wait_load=False)
browser.click("#LangPair_ToDDL tbody tr td a[href='#de']", wait_load=False)
# Type the sentence into the input box and press the translate button.
browser.wk_fill("#InputText", "How are you doing today?")
browser.click_ajax("#TranslateButton")
import time
# Fixed 15-second pause before reading the output element.
time.sleep(15)
# Take the first element matching '#TranslationOutput' and extract its text.
result_box = browser.soup('#TranslationOutput')[0]
# NOTE(review): translated_text is never printed or used in the visible code.
translated_text = result_box.text_content()
# browser.show()
time.sleep(30)
예제 #10
0
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default string encoding can be switched to UTF-8.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from spynner import Browser
from pyquery import PyQuery
import difflib
import re

# A single browser is created at import time and kept as a module global.
print '* starting browser...'
browser = Browser()
# Use PyQuery as the browser's HTML parser.
browser.set_html_parser(PyQuery)


class NoMatchException(Exception):
    """Custom exception type with no extra behaviour.

    NOTE(review): the code raising it is not visible here; judging by the
    name it presumably signals that no matching result was found.
    """


def match_score(a, b):
    """Return a case-insensitive similarity score for two strings.

    The score is the mean of two values computed on the lower-cased inputs:

    * ``difflib.SequenceMatcher.ratio()`` (a single space is treated as
      junk by the matcher), and
    * the size of the longest common block divided by the length of the
      shorter string.

    Identical non-empty strings score 1.0 and strings with nothing in
    common score 0.0. Empty input is handled explicitly so that no
    division by zero occurs: two empty strings score 1.0, and an empty
    string against a non-empty one scores 0.0.

    :param a: first string.
    :param b: second string.
    :returns: float score.
    """
    # Lower-case once and measure the lower-cased text; the matcher works on
    # these strings, so the search bounds must be their lengths.
    a_low = a.lower()
    b_low = b.lower()
    shorter = min(len(a_low), len(b_low))

    matcher = difflib.SequenceMatcher(lambda ch: ch == ' ', a_low, b_low)
    ratio = matcher.ratio()
    if shorter == 0:
        # Nothing to divide by. ratio() is already 1.0 for two empty strings
        # and 0.0 when only one side is empty, which is the sensible score.
        return ratio

    longest = matcher.find_longest_match(0, len(a_low), 0, len(b_low)).size
    return (ratio + float(longest) / shorter) / 2


def get_info_from_title(title):
    """Print ``title`` in brackets and create an empty ``data`` dict.

    NOTE(review): this excerpt ends at the "navigate to scopus" comment, so
    the rest of the function (how ``data`` is filled, what is returned)
    cannot be confirmed from here.
    """
    print '[%s]' % title
    data = {}

    # navigate to scopus
예제 #11
0
class FileTranslator(object):
    """Translate text, or the ``text`` field of a JSON file, via Google Translate.

    One of three backends is selected with ``translate_type`` and exposed as
    ``self.translate(text)``:

    * ``'headless'`` -- spynner headless browser
      (``translate_text_google_headless``)
    * ``'selenium'`` -- Firefox driven by selenium (``translate_text_google``)
    * ``'simple'`` -- a plain HTTP request (``translate_simple``)
    """

    def __init__(self, source_lang, target_lang, translate_type='headless'):
        """Store the language pair and set up the chosen backend.

        :param source_lang: source language; must provide
            ``to_google_translate()``.
        :param target_lang: target language; same requirement.
        :param translate_type: ``'headless'``, ``'selenium'`` or ``'simple'``.
        :raises KeyError: if ``translate_type`` is none of the above.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang

        # Only the browser required by the chosen backend is started; the
        # other attribute stays None ('simple' needs neither).
        self.headless_browser = self.driver = None
        if translate_type == 'headless':
            self.headless_browser = Browser()
            self.headless_browser.set_html_parser(PyQuery)
        elif translate_type == 'selenium':
            self.driver = webdriver.Firefox()

        # Bind the backend so callers can simply call self.translate(text).
        self.translate = {
            'simple': self.translate_simple,
            'headless': self.translate_text_google_headless,
            'selenium': self.translate_text_google,
        }[translate_type]

    def _forcefully_kill_firefox(self):
        """Kill every Firefox process; used when Firefox gets stuck.

        Relies on the Windows ``taskkill`` command.
        """
        print("Killing Firefox forcefully...")
        os.system('taskkill /im firefox.exe /f /t')

    def translate_text_google(self, text_to_translate, quit_browser=True):
        """Translate ``text_to_translate`` with selenium (slow, may hang).

        :param text_to_translate: text placed verbatim into the URL fragment.
        :param quit_browser: currently unused; kept for compatibility.
        :returns: text of the page's ``result_box`` element.

        Whatever ``WebDriverWait.until`` raises when the result does not
        appear in time propagates to the caller.
        """
        # Open Google Translate website
        url = "http://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(), text_to_translate)
        self.driver.get(url)

        # Watchdog: if the wait below is still blocked after 11 seconds,
        # Firefox is considered stuck and is killed.
        watchdog = Timer(11.0, self._forcefully_kill_firefox)
        watchdog.start()
        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//span[@id='result_box']/span[@class='hps']")))
        finally:
            # Always disarm the watchdog. Otherwise a wait that fails after
            # 10 seconds leaves the timer running and it kills Firefox a
            # second later, possibly in the middle of the next translation.
            watchdog.cancel()

        result = self.driver.find_element_by_id('result_box')
        translated_text = result.text
        # Navigate away so the next call starts from a different page.
        self.driver.get('http://www.google.com')
        return translated_text

    def translate_text_google_headless(self, text_to_translate):
        """Translate ``text_to_translate`` with the spynner headless browser.

        :returns: text content of the first ``#result_box`` element.
        :raises IndexError: if the page has no ``#result_box`` element.
        """
        url = "https://translate.google.com/#%s/%s/%s" % (
            self.source_lang.to_google_translate(),
            self.target_lang.to_google_translate(), text_to_translate)
        self.headless_browser.load(url)
        result_box = self.headless_browser.soup('#result_box')[0]
        translated_text = result_box.text_content()
        # Navigate away so the next call starts from a different page.
        self.headless_browser.load('http://www.example.com/')
        return translated_text

    def translate_simple(self, text_to_translate):
        """Translate ``text_to_translate`` with a plain HTTP request.

        Requests the mobile Google Translate page and returns the text that
        follows the ``class="t0">`` marker, up to the next ``<``.

        It doesn't really work on more than 2000 characters for some reason
        (maybe a limit imposed by Google).

        NOTE(review): only spaces are escaped (as ``+``); other characters
        that are special in a query string (``&``, ``#``, ``%``...) are sent
        as-is.

        :returns: the translated text.
        :raises ValueError: if the response does not contain the marker.
        """
        agents = {
            'User-Agent':
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)"
        }
        before_trans = 'class="t0">'
        link = "http://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            self.target_lang.to_google_translate(),
            self.source_lang.to_google_translate(),
            text_to_translate.replace(" ", "+"))
        request = urllib2.Request(link, headers=agents)
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()

        start = page.find(before_trans)
        if start == -1:
            # Without this check find()'s -1 would be used as an offset and
            # an arbitrary piece of the page returned as the "translation".
            raise ValueError(
                "Translation marker %r not found in response" % before_trans)
        result = page[start + len(before_trans):]
        return result.split("<")[0]

    def translate_text_spanishenglish(self, text_to_translate):
        """
            An attempt to use spynner to translate through spanishenglish.com
            (instead of google.. in case google will block us or something..)
            The problem is that for some reason spanishenglish.com doesn't work with the
            spynner core (it does work well with chrome though)... so it's stuck now...

            Currently does nothing and returns None.
        """
        # url = "http://www.spanishenglish.com/"
        # self.headless_browser.load(url)
        # print "LOADED"
        # self.headless_browser.click("a[href='#en']", wait_load=False)
        # self.headless_browser.click("#LangPair_ToDDL tbody tr td a[href='#de']", wait_load=False)
        # self.headless_browser.wk_fill("#InputText", "How are you doing today?")
        # self.headless_browser.click_ajax("#TranslateButton")
        #
        # import time
        # #time.sleep(15)
        # result_box = self.headless_browser.soup('#OutputTextHtmlCell')[0]
        # translated_text = result_box.text_content()
        # return translated_text
        #
        # # self.headless_browser.show()
        # # time.sleep(30)

    def translate_file(self, path):
        """Translate the ``text`` field of the JSON document stored at ``path``.

        :returns: whatever ``self.translate`` returns for that text.
        """
        with open(path) as f:
            raw = f.read()
        # The file is closed before the (slow) translation starts.
        text = simplejson.loads(raw)['text']
        return self.translate(text)

    def translate_to_file(self, source_path, target_path):
        """Translate ``source_path`` and write the result to ``target_path``.

        Failures are handled here and never propagate: on any error nothing
        is written. ``target_path`` is written as UTF-8.
        """
        import traceback

        try:
            translated_text = self.translate_file(source_path)
        except httplib.CannotSendRequest:
            # The connection to the driver is unusable (presumably because
            # Firefox was killed); start a new Firefox for later calls and
            # skip this file.
            self.driver = webdriver.Firefox()
            return
        except Exception:
            print("Failed to translate: {:s}".format(source_path))
            print('=' * 60)
            print(traceback.format_exc())
            return

        with codecs.open(target_path, 'w', 'utf-8') as f:
            f.write(translated_text)