Example No. 1
import os
import sys


def main():
    paragraph_url = sys.argv[1]
    question_path = sys.argv[2]
    paragraph_path = sys.argv[3]
    ws = WebScraper(paragraph_url)
    ws.saveBody(paragraph_path)

    os.system(
        "python Test_Batch.py --paragraph " + paragraph_path + " --question " +
        question_path +
        " --model pytorch_model.bin --config_file Results/bert_config.json")
Example No. 2
 def test_get_detail_links(self):
     with open('test_data/expected_links.txt', 'r') as f:
         expected = [line.rstrip() for line in f]
     with open('test_data/search_results.html', 'r') as f:
         page_source = f.read()
     actual = WebScraper.get_detail_links(page_source)
     assert actual == expected
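
A sketch of running this test, assuming it sits in a pytest test suite; the test module and class names are not shown in the snippet, so pytest's -k selection is used instead.

import pytest

# Select the test by name; pytest discovers the surrounding module and class on its own.
pytest.main(['-k', 'test_get_detail_links'])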
Example No. 3
class BookWriter:
    def __init__(self, fileName: str, domain: str, userid: str,
                 authtoken: str):
        self.w = WebScraper(domain, userid, authtoken)
        self.tableOfContents = self.w.MakeToc()
        self.pages = self.w.MakeSites()
        self._fileName = fileName

    def saveBook(self):
        pdf = MyFPDF()
        pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
        self.tableOfContents.createPage(pdf)
        pdf.add_page()
        for it in self.pages:
            it.makePage(pdf)

        # Save the PDF
        pdf.output(self._fileName)
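
A hedged usage sketch for the class above; the file name, domain, user id, and auth token below are placeholders, not values from the original source.

# All four arguments are invented placeholders; only the call pattern follows
# the constructor and saveBook() shown above.
writer = BookWriter('book.pdf', 'docs.example.com', 'user@example.com', 'secret-token')
writer.saveBook()  # renders the table of contents, then one page per scraped site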
Example No. 4
import csv


def scrape_for_prices(cards):

    #prices, not_found = scraper.scrape_all(cards)
    #print(prices)
    #print(not_found)
    multiverse_ids = set()
    try:
        for filename in ["prices_abc.csv", "prices_2.csv", "prices_3.csv"]:
            with open(filename, "r") as f:
                reader = csv.reader(f)
                for line in reader:
                    multiverse_ids.add(int(line[0]))

    except FileNotFoundError:
        pass

    scraper = WebScraper('https://www.mtggoldfish.com/price-download/paper/',
                         multiverse_ids)
    scraper.download_csv(cards)
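
A sketch of a call to the function above; how the cards collection is assembled is not shown in this snippet, so the loader below is purely hypothetical.

# build_card_list() is a hypothetical helper; only scrape_for_prices(cards)
# itself mirrors the snippet above.
cards = build_card_list()
scrape_for_prices(cards)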
Example No. 5
 def run(self):
     stp_words = list()
     with open('stopwords.txt') as file:
         for line in file:
             line = line.strip()
             stp_words.append(line)
     spider = WebScraper(stp_words)
     while True:
         tbd_url = self.frontier.get_tbd_url()
         if not tbd_url:
             self.logger.info("Frontier is empty. Stopping Crawler.")
             with open('ReportText.txt', 'w+') as f:
                 common_dict = spider.most_common_words()
                 f.write('Unique Pages Count: ' + str(spider.get_unique_pages_count()) + '\n')
                 f.write('\n')
                 f.write('Longest Page: \n')
                 for key, value in spider.get_longest_page().items():
                     f.write(str(key) + ' -> ' + str(value) + ' words \n')
                 f.write('\n')
                 count = 0
                 f.write('50 Most Common Words: \n')
                 for item in common_dict:
                     if count == 50:
                         break
                     else:
                         f.write(str(item[0]) + ' -> ' + str(item[1]) + '\n')
                         count += 1
                 f.write('\n')
                 f.write('Subdomains in ics.uci.edu: \n')
                 for key, value in spider.get_subdomains().items():
                     f.write(str(key) + ' -> ' + str(value) + '\n')
             break
         if self.frontier.check_url_completed(tbd_url):
             print("URL Already marked complete")
             print(tbd_url)
             print("Loading next url")
             continue
         resp = download(tbd_url, self.config, self.logger)
         if resp is None:
             self.logger.info(f"{tbd_url} Timeout")
             continue
         self.logger.info(
             f"Downloaded {tbd_url}, status <{resp.status}>, "
             f"using cache {self.config.cache_server}.")
         scraped_urls = spider.scraper(tbd_url, resp)
         check_robots = self.parse_robots_txt(scraped_urls)
         for scraped_url in check_robots:
             self.frontier.add_url(scraped_url)
         self.frontier.mark_url_complete(tbd_url)
         time.sleep(self.config.time_delay)
Example No. 6
def start_scrape():
    """Searches for things to scrape and gets to work."""
    from scraper import WebScraper
    from db import Composer

    logging.info("Starting to scrape.")

    session = DB_SESSION()
    IS = WebScraper(session)

    # Get composers into DB
    if session.query(Composer).count() < 2000:
        IS.scrape_composer_list()

    # Scrape the pieces off composers.
    IS.scrape_all_composers()

    # Scrape all the piece info off the pages.
    IS.scrape_all_pieces()

    # Download all the scores.
    IS.download_all_scores()

    logging.info("Done downloading!")
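
A sketch of invoking the routine above, assuming logging is configured at module level; the DB_SESSION factory it relies on is defined elsewhere in the original project.

import logging

# Basic logging setup is an assumption; start_scrape() itself is the function above.
logging.basicConfig(level=logging.INFO)
start_scrape()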
Example No. 7
class SearchDocument(object):
    ## global stemming engine
    try:
        stemmer = SnowballStemmer("english")
    except Exception as e:
        print("SearchDocument class failed to initialize stemming engine: {}".format(e))
        stemmer = None
    ## global cache of word stemming to reduce calling snowball stemming API
    cache_stemming = {}
    scraper = WebScraper()

    ## the constructor
    #  @param fields "items" of Google search results, type: dict
    #  @param stemming (False) True to enable stemming, e.g. stem("apples") = "apple"
    #  @param htmltext (False) True to use the html text instead of snippet
    #  @param normalize (False) True to have term frequency instead of raw counts
    def __init__(self,
                 fields,
                 stemming=False,
                 htmltext=False,
                 normalize=False):
        self.title = fields['title']
        self.displink = fields['displayLink']
        self.url = fields['link']  # 'link' is the complete URL, not 'formattedUrl'
        self.snippet = fields['snippet']
        self.key = self.url
        self.text = self.scraper.scrape_text(self.url) if htmltext else ""
        self.stemming = stemming

        # all words in document (with duplicates)
        if htmltext:
            self.words = self.__parse(self.text)
        else:
            self.words = self.__parse(self.title) + self.__parse(self.snippet)
        # document length
        self.size = len(self.words)
        # term count/frequency
        self.tf = self.__tf(normalize)

    ## calculate term occurrence/frequency
    def __tf(self, normalize):
        res = Counter(self.words)
        if normalize:
            for word, count in res.items():
                res[word] = float(count) / self.size
        return res

    ## apply word stemming if necessary
    def __stem(self, word):
        if not self.stemming or not self.stemmer:
            # stemming is disabled
            return word
        elif word in self.cache_stemming:
            # found in cache
            return self.cache_stemming[word]
        else:
            # stem, and save in cache
            stemmed = self.cache_stemming[word] = self.stemmer.stem(word)
            return stemmed

    ## convert string to list of lowercase words w/o punctuations
    def __parse(self, s):
        res = []
        for w in s.split():
            r = re.sub(r'[\W_]', '', self.__stem(w.lower()))
            if r:
                res.append(r)
        return res
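
A usage sketch for the class above, built around a dictionary shaped like one 'items' entry of a Google Custom Search response; the field values are invented.

# Only the keys mirror what __init__ reads; the values are made up.
item = {
    'title': 'Apple - Wikipedia',
    'displayLink': 'en.wikipedia.org',
    'link': 'https://en.wikipedia.org/wiki/Apple',
    'snippet': 'An apple is a round, edible fruit produced by an apple tree.',
}
doc = SearchDocument(item, normalize=True)
print(doc.size)               # number of words parsed from title + snippet
print(doc.tf.most_common(3))  # highest-frequency terms and their normalized frequencies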
Example No. 9
from scraper import WebScraper
from api import GoogleCalAPI

scraper = WebScraper()
url = 'https://www.ufrgs.br/ppgbioq/category/eventos/'
scraper.get_html(url)
seminars_list = scraper.get_data()
api = GoogleCalAPI()
api.create_events(seminars_list)
Example No. 10
from scraper import WebScraper

# Select the academic year to scrape, between 1314 and 1819. "ALL" scrapes all of them.
course = "1516"

ws = WebScraper(course)

ws.scrape()
ws.write_csv()
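
A variant of the run above that captures every academic year, using the "ALL" option mentioned in the comment.

ws_all = WebScraper("ALL")  # "ALL" scrapes every course, per the comment above
ws_all.scrape()
ws_all.write_csv()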

Example No. 11
def start_shell():
    """Opens up an IPython window and a DB session to browse the database."""
    from scraper import WebScraper
    session = DB_SESSION()
    IS = WebScraper(session)
    IPython.embed()