Example No. 1
 def test_categorize_relative_urls_provided(self):
     p = LinkParser()
     p.feed(base_url='http://feeds.huffingtonpost.com',
            html='''
         <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" />
     ''')
     self.assertEqual(p.find_base_url(), 'http://feeds.huffingtonpost.com')
     self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])
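
The LinkParser exercised by these tests is not reproduced on this page. As a rough sketch only, a feed-link collector along the same lines could be built on Python 3's standard html.parser; the class name and the RSS_KEY/ATOM_KEY values below are stand-ins, not the project's actual implementation or constants.

from html.parser import HTMLParser
from urllib.parse import urljoin

RSS_KEY, ATOM_KEY = 'rss', 'atom'  # stand-in keys, not the project's constants

class MinimalFeedLinkParser(HTMLParser):
    """Collects <link rel="alternate"> feed URLs, resolving relative hrefs."""

    def __init__(self, base_url=''):
        super().__init__()
        self.base_url = base_url
        self.data = {RSS_KEY: [], ATOM_KEY: []}

    def handle_starttag(self, tag, attrs):
        if tag != 'link':
            return
        attrs = dict(attrs)
        if attrs.get('rel') != 'alternate' or not attrs.get('href'):
            return
        href = urljoin(self.base_url, attrs['href'])
        if attrs.get('type') == 'application/rss+xml':
            self.data[RSS_KEY].append(href)
        elif attrs.get('type') == 'application/atom+xml':
            self.data[ATOM_KEY].append(href)
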
Example No. 2
    def xtest_500plus_links(self):
        p = LinkParser()

        input_html = read_file('01_input.html')
        p.feed(input_html, timeout=60)
        output_json = read_file('01_output.json')
        data = json.loads(output_json)

        self.assertSetEqual(set(data[RSS_KEY]), set(p.data[RSS_KEY]))
        self.assertSetEqual(set(data[ATOM_KEY]), set(p.data[ATOM_KEY]))
Example No. 3
	def __crawlPage(self, pageName):
		fullPageName = pageName
		# If the page is a url go directly there otherwise prepend the domain
		if(pageName.find('://')!=-1):
			page = self.__getPage(pageName)
		else:
			page = self.__getPage(self.domain+pageName)
		# Some link urls will be in the format /page1/page2
		# we remove this to avoid http://site//page1/page2
		if pageName.startswith('/'):
			pageName = pageName[1:]
		parser = LinkParser(self.domain)
		parser.feed(page)
		pageLinks = parser.getLinks()
		self.discovered = self.discovered.union(pageLinks)
		# Convert links to list for later json serialisation
		self.map.append({'page': fullPageName, 'links': list(pageLinks)})	
Example No. 4
    def crawlAllUrl(self, outputFlag=False, crawlAmountLimit=CRAWL_AMOUNT_LIMIT):
        while len(Crawler.urlList)>0:
            Crawler.urlRecordLock.acquire()  # lock the queue while taking the next url
            url = Crawler.urlList.pop()
            pathname = self.url2Pathname(url)
            Crawler.urlNotDone.pop(pathname)
            
            if Crawler.crawledAmount >= crawlAmountLimit:
                Crawler.urlRecordLock.release()
                break
            Crawler.urlRecordLock.release()
            
            result = self.crawlUrl(NORMAL_SITE,url,outputFlag)
            try:
                urlArr = urlparse.urlparse(url)
                # if the url could not be crawled, record an error for its host
                # (counted against MIN_ERRORS_ALLOWED_FOR_A_SITE) and skip it
                if result == False:
                    Crawler.urlRecordLock.acquire()
                    if urlArr.netloc in Crawler.errorCounter:
                        Crawler.errorCounter[urlArr.netloc] += 1
                    else:
                        Crawler.errorCounter[urlArr.netloc] = 1
                    Crawler.urlRecordLock.release()
                    continue
                _path = urlArr.path
                rightMostSlashIndex = _path.rfind('/')
                replaced = _path[rightMostSlashIndex : len(_path)]
                #try to parse relative address
                if replaced.find('.') != -1:
                    _path = _path.replace(replaced,'')
                hostPath = urlArr.scheme + '://' + urlArr.netloc + _path 
                 
                parser = LinkParser()
                parser.setFlag(NORMAL_SITE)
                parser.setHostPath(hostPath)
                parser.feed(result)
                urlList = parser.hrefsList
                    
                Crawler.urlRecordLock.acquire()
                self.addUrlList(urlList)
                Crawler.crawledAmount += 1
                Crawler.urlRecordLock.release()

                parser.close()
                    
            except Exception as e:
                #print(e)
                self.reportError(url, msg[ERROR_HTML_PARSE])
Example No. 5
def request_data(endpoint, request, timeout):

    next_link = endpoint + request

    token = get_auth_token()

    headers = {'Authorization': token}

    response = []

    while next_link != '':

        try:
            r = requests.get(next_link, headers=headers, timeout=(3, timeout))
        except requests.exceptions.Timeout as e:
            logger.warning('Request %s timed out after %f seconds.',
                           next_link, timeout)
            return [598, response]
        except requests.exceptions.ConnectionError as e:
            logger.error('Caught %s', e)
            app_config.request_connection_failure = True
            raise

        app_config.request_connection_failure = False

        if r.status_code == 200:

            response.append(r.text)

            if 'link' in r.headers:
                link_string = r.headers['link']
                lp = LinkParser(link_string)
                next_link = lp.get_link('next')
            else:
                next_link = ''

        else:
            logger.warning('Failed request with status code %d', r.status_code)
            return [r.status_code, response]

    return [200, response]
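
For context, request_data returns a [status, pages] pair and keeps following the rel="next" URL from each response's Link header until pagination is exhausted. A hypothetical call could look like this, assuming each page body is a JSON array; the URL and request path are placeholders.

import json

status, pages = request_data('https://api.example.com/', 'v1/items?per_page=100',
                             timeout=30)
if status == 200:
    # each entry in pages is one raw response body
    items = [item for page in pages for item in json.loads(page)]
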
Example No. 6
 def test_multiple_runs(self):
     p = LinkParser()
     p.feed('''
         <link rel="canonical" href="http://feeds.huffingtonpost.com" />
         <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" />
     ''')
     p.feed(
         '<link rel="alternate" type="application/atom+xml" href="http://feeds.feedburner.com/PTCC" />'
     )
     self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])
     self.assertListEqual([SAMPLE_ATOM], p.data[ATOM_KEY])
Example No. 7
 def test_all_links(self):
     link = '<https://api.github.com/organizations/913567/repos?page=3>; rel="next", <https://api.github.com/organizations/913567/repos?page=5>; rel="last", <https://api.github.com/organizations/913567/repos?page=1>; rel="first", <https://api.github.com/organizations/913567/repos?page=1>; rel="prev"'
     lp = LinkParser(link)
     self.assertEqual(
         lp.get_link('next'),
         'https://api.github.com/organizations/913567/repos?page=3')
     self.assertEqual(
         lp.get_link('last'),
         'https://api.github.com/organizations/913567/repos?page=5')
     self.assertEqual(
         lp.get_link('prev'),
         'https://api.github.com/organizations/913567/repos?page=1')
     self.assertEqual(
         lp.get_link('first'),
         'https://api.github.com/organizations/913567/repos?page=1')
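
The header strings in these tests follow the RFC 5988 Link format used by the GitHub API: comma-separated '<url>; rel="name"' pairs. The helper below is a hypothetical sketch of that parsing idea, not the LinkParser implementation under test.

import re

def parse_link_header(header):
    """Split an RFC 5988 Link header into a {rel: url} mapping."""
    links = {}
    for part in header.split(','):
        match = re.search(r'<([^>]*)>\s*;\s*rel="([^"]*)"', part)
        if match:
            url, rel = match.groups()
            links[rel] = url
    return links

links = parse_link_header('<https://api.github.com/organizations/913567/repos?page=3>; rel="next"')
print(links.get('next', ''))  # https://api.github.com/organizations/913567/repos?page=3
print(links.get('prev', ''))  # empty string for an absent relation, matching the tests above
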
Example No. 8
def main():
    initResult = init.initGlobal()
    if initResult != False:
        #input
        print("Please enter your keyword")
        keyword = raw_input()
        keyword = keyword.replace(' ','+')
        
        #start crawling from search engine
        crawler = Crawler()
        startTime = time.time()
        crawler.loadRecord(LOG_OF_CRAWLED_URL)
        crawler.loadRecord(LOG_OF_CRAWLED_CONTENT)
        crawler.addSearchEngineUrl(keyword)
        htmlcode = crawler.crawlUrl(GOOGLE)
        parser = LinkParser()
        parser.setFlag(GOOGLE)
        parser.feed(htmlcode)
        top10 = parser.hrefsList
        crawler.addUrlList(top10,GOOGLE)

        parser.close()
        threadPool = []
        # run the crawl with THREAD_NUM worker threads
        while len(threadPool) < THREAD_NUM:
            th = threading.Thread(target=crawl)
            threadPool.append(th)
            
        for item in threadPool:
            item.start()  
        for item in threadPool:
            item.join()
              
        crawler.flush()
        endTime = time.time()
        print("time used:")
        print(endTime-startTime)
        keyword = raw_input()
Example No. 9
 def test_empty_string(self):
     lp = LinkParser("")
     self.assertEqual(lp.get_link('next'), '')
     self.assertEqual(lp.get_link('prev'), '')
     self.assertEqual(lp.get_link('last'), '')
     self.assertEqual(lp.get_link('first'), '')
Example No. 10
 def test_mal_formed_link_missing_equal(self):
     link = '<https://api.github.com/organizations/913567/repos?page=3>; rel~"next"'
     lp = LinkParser(link)
     self.assertEqual(lp.get_link('next'), '')
Example No. 11
 def test_mal_formed_link_missing_close_angle_bracket(self):
     link = '<https://api.github.com/organizations/913567/repos?page=3; rel="next"'
     lp = LinkParser(link)
     self.assertEqual(lp.get_link('next'), '')
Example No. 12
 def load_link_parser(self, path):
     """loads link parser"""
     self.lparser = LinkParser(path=path)
Example No. 13
class FeatureLoader:
    """loads features for sentences"""
    def __init__(self):
        """init dictionary"""
        # note that this dict recognizes more british spellings than en_US but not all
        self.dict = enchant.Dict('en')

    def load_link_parser(self, path):
        """loads link parser"""
        self.lparser = LinkParser(path=path)

    def process_line(self, line):
        """returns feature dict for line"""
        line = line.strip()
        tokens = line.lower().split()
        features = {'length': len(tokens)}
        if len(line) == 0:
            return features
        try:
            features.update(self.feats_spelling(tokens))
        except:
            print >> sys.stderr, 'Error extracting spelling feats'
            print >> sys.stderr, traceback.format_exc()
        try:
            features.update(self.feats_link(' '.join(tokens)))
        except:
            print >> sys.stderr, 'Error extracting link feats'
            print >> sys.stderr, traceback.format_exc()
        try:
            features.update(self.feats_ngram_lm(tokens))
        except:
            print >> sys.stderr, 'Error extracting ngram/lm feats'
            print >> sys.stderr, traceback.format_exc()
        return features

    def process_file(self, fpath):
        """iterate through file and extract features by line"""
        all_features = []
        with open(fpath) as f:
            for i, l in enumerate(f):
                if i % 100 == 0:
                    print i
                all_features.append(self.process_line(l))
        return all_features

    def feats_spelling(self, tokens):
        """get spelling features"""
        n = 0
        miss = 0
        for s in tokens:
            if s.isalpha():
                n += 1
                if not self.dict.check(s):
                    miss += 1
        return {
            'num_miss': miss,
            'prop_miss': 1.0 * miss / max(1, n),
            'log_miss': log(miss + 1)
        }

    def load_lms(self, gpath, tpath):
        """load language models"""
        self.gigalm = kenlm.LanguageModel(gpath)
        self.toefllm = kenlm.LanguageModel(tpath)

    def get_ngram_prob(self, tokens):
        """get smoothed ngram prob from gigaword LM"""
        return self.gigalm.score(' '.join(tokens), bos=False, eos=False)

    def get_sent_prob(self, tokens, lm):
        oovs = 0
        score = 0
        for s in lm.full_scores(' '.join(tokens)):
            if s[2]:
                oovs += 1
            score += s[0]
        return oovs, score

    def feats_ngram_lm(self, tokens):
        """extract ngram and lm features"""
        features = {}
        for n in range(1, 4):
            if n > len(tokens):
                continue
            ngrams = Counter(
                [tuple(tokens[i:i + n]) for i in xrange(len(tokens) + 1 - n)])
            probabilities = [self.get_ngram_prob(ng) for ng in ngrams]
            features['min_s_%d' % n] = min(probabilities)
            features['max_s_%d' % n] = max(probabilities)
            features['sum_s_%d' %
                     n] = sum(probabilities) / sum(ngrams.values())
        features['giga_oov'], features['giga_p'] = self.get_sent_prob(
            tokens, self.gigalm)
        features['toefl11_oov'], features['toefl11_p'] = self.get_sent_prob(
            tokens, self.toefllm)
        return features

    def feats_link(self, l):
        """extract link parser feature"""
        return {'complete_link': self.lparser.has_parse(l)}

    def get_next_block(self, infile):
        """return the next block of lines in a file until a blank line is read
        specifically for dealing with the parser output"""
        lines = None
        while True:
            l = infile.readline().strip()
            if l.strip() == '(())':
                raise TooLongError('Sentence too long to parse')
            if not l or len(l) == 0:
                return lines
            if lines is None:
                lines = []
            lines.append(l)
        return lines

    def load_parse_features(self, f):
        """read the stanford parser output to get parse features"""
        ret = []
        with open(f) as infile:
            while True:
                try:
                    next_parse = self.get_next_block(infile)
                except TooLongError:
                    ret.append(None)
                    continue
                if not next_parse:
                    break
                if next_parse[0].startswith('#'):
                    features = {}
                    features['parse'] = next_parse[1]
                    if next_parse[0].endswith('NA'):
                        continue
                    features['parse_score'] = float(next_parse[0].split()[-1])
                    features['sentential_top_node'] = next_parse[1].split()[1][1] == 'S'
                    features['dep_count'] = sum([
                        1 if l.startswith('dep') else 0 for l in next_parse[2:]
                    ])
                    ret.append(features)
        return ret

    def load_hpsg_features(self, fpath):
        """given a path to the output of ./cheap, return a list of dictionaries that contain the
        specified features of each sentence"""
        features = None
        for line in open(fpath):
            if ':' not in line:
                continue
            key, value = line.split(':', 1)
            if key == 'id':
                # marks a new sentence
                if features:
                    yield (features)
                features = {}
            elif key in HPSG_FEATURES:
                # values are read from text, so convert before taking the log
                features[key] = log(float(value) + 1)
        if features:
            yield features
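
A driver for this loader would wire up the link parser and language models before calling process_line. The snippet below is a hypothetical usage sketch; every path and file name in it is a placeholder.

loader = FeatureLoader()
loader.load_link_parser('/path/to/link-grammar-data')  # placeholder path
loader.load_lms('gigaword.arpa', 'toefl11.arpa')       # placeholder LM files
features = loader.process_line('This are a sentense with misspellings .')
# features now holds length, spelling, link-parse and n-gram/LM scores for the line
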
Example No. 14
 def test_empty_href(self):
     p = LinkParser()
     p.feed('<a href>test</a><link href><a href="' + SAMPLE_RSS + '"></a>')
     self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])