Example No. 1
    def add_website(self, website_meta):
        '''Add the website to the database.'''

        # Check whether this website already exists in the database.
        web = Website.objects(
                    name=website_meta.get("name"),
                    homepage_url=website_meta.get("homepage_url")
                ).first()

        if web:
            return web

        # Create a new Website document and save it to the database.
        web = Website(
                name=website_meta.get("name"),
                homepage_url=website_meta.get("homepage_url")
                )
        try:
            status = web.save()
        except ValidationError:
            self.logger.warning('Save/Validate Website failed! url: {0}'
                                .format(website_meta.get("homepage_url")))
            return None

        return web if status else None
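Note that the separate existence check and save() above are not atomic, so two concurrent calls could still insert duplicates. A minimal sketch of an alternative, assuming MongoEngine's findAndModify-based QuerySet.modify with upsert support is available in the installed version:

    def add_website(self, website_meta):
        '''Fetch-or-create the website in a single upsert (sketch).'''
        # upsert=True creates the document if it does not exist;
        # new=True returns the document after the operation.
        return Website.objects(
            name=website_meta.get("name"),
            homepage_url=website_meta.get("homepage_url"),
        ).modify(
            upsert=True,
            new=True,
            set__name=website_meta.get("name"),
            set__homepage_url=website_meta.get("homepage_url"),
        )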
Example No. 2
def main():
    if len(sys.argv) != 2:
        print("Invalid command line arguments.")
        print("Usage: python3 diagnose.py <WEBSITE_URL>")
        sys.exit(1)

    url = sys.argv[1]

    FILEPATH_PREFIX = "data/"
    FILEPATH_TEXT_SUFFIX_CLEAN = "_clean.txt"
    FILEPATH_TEXT_SUFFIX_BLOCK = "_block.txt"
    FILEPATH_IMAGE_SUFFIX_CLEAN = "_clean.png"
    FILEPATH_IMAGE_SUFFIX_BLOCK = "_block.png"

    txt_clean = FILEPATH_PREFIX + url + FILEPATH_TEXT_SUFFIX_CLEAN
    txt_block = FILEPATH_PREFIX + url + FILEPATH_TEXT_SUFFIX_BLOCK
    img_clean = FILEPATH_PREFIX + url + FILEPATH_IMAGE_SUFFIX_CLEAN
    img_block = FILEPATH_PREFIX + url + FILEPATH_IMAGE_SUFFIX_BLOCK

    website_clean = Website(txt_clean, img_clean, "clean")
    website_block = Website(txt_block, img_block, "block")
    pair = WebsitePair(website_clean, website_block)

    # @TODO This if-else section can be expanded as future faults become detectable.
    if page_is_blank(website_clean, website_block, pair):
        print("Page is blank!")

    else:
        print("No faults detected.")
Example No. 3
    def startup(self):
        """
        Some stuff that should get called after everything is loaded.
        """
        self.env.seishub.startup()
        self.nw_tree.startup()

        # Connect some slots.
        QtCore.QObject.connect(self.nw_tree.nw_select_model,
                               QtCore.SIGNAL("selectionChanged(QItemSelection, QItemSelection)"), \
                               self.waveforms.waveform_scene.add_channel)

        web = Website(env=self.env)
        web.startup()
        # Load the map HTML that will later be displayed in a WebView.
        with open(os.path.join(self.env.temp_res_dir, 'map.html')) as html_file:
            html = html_file.read()
        self.env.web.setHtml(html)
        self.picks.update()

        css_url = QtCore.QUrl.fromLocalFile(os.path.abspath(self.env.css))

        server = '%s/manage/seismology/stations' % self.env.seishub_server
        url = QtCore.QUrl(server)
        url.setUserName(self.env.seishub_user)
        url.setPassword(self.env.seishub_password)
        # Might work with some Qt version...
        self.env.station_browser.page().settings().setUserStyleSheetUrl(css_url)
        self.env.station_browser.load(url)
        self.env.station_browser.page().settings().setUserStyleSheetUrl(css_url)
Example No. 4
    def setUp(self):
        self.single_plan = Plan('Single', 49, 1)
        self.plus_plan = Plan('Plus', 99, 3)

        self.website_1 = Website('https://google.com')
        self.website_2 = Website('https://google.com')
        
        self.customer_1 = Customer('customer_1', '123456789', '*****@*****.**')
Example No. 5
    def __init__(self, name, url, internalLinkPattern, targetPattern,
                 titleSelector, priceSelector):

        Website.__init__(self, name, url, internalLinkPattern, targetPattern,
                         titleSelector)
        self.products = []
        self.termsToExclude = []
        self.priceSelector = priceSelector
Example No. 6
File: jd.py  Project: wliustc/jd
 def __init__(self, user):
     Website.__init__(self, 'JD', user)
     self.login_page = LoginPage(self)
     self.activ_page = ActiPage(self)
     self.list_page = ListPage(self)
     self.main_page = MainPage(self)
     self.coupon_page = CouponPage(self)
     self.data_page = DataPage(self)
Example No. 7
 def __init__(self, user):
     Website.__init__(self, 'JD_mobile', user)
     self.login_page = LoginPage(self)
     self.main_page = MainPage(self)
     self.data_page = DataPage(self)
     self.charge_page = ChargePage(self)
     self.get_coupon_page = GetCouponPage(self)
     self.json_page = JsonPage(self)
Example No. 8
 def serializeWebsite(self, website):
     if ("sitemap" in website.keys()):
         return Website(website['homepage'],
                        website['input_dict'],
                        website['lastmod'],
                        sitemap=website['sitemap'])
     else:
         return Website(website['homepage'], website['input_dict'],
                        website['lastmod'])
Example No. 9
def home():
    if request.method == 'POST':
        Website(request.form['url'])
    Website.check_all()
    return render_template(
        "home.html",
        pages=Website.all,
        length=len(Website.all)
    )
Example No. 10
def test_fix_link(link, hostname, scheme, result):
    mock_parsed_url = Mock()
    mock_parsed_url.hostname = hostname
    mock_parsed_url.scheme = scheme
    mock_parsed_url.netloc = hostname
    website = Website('seed_url')
    assert website.fix_link(link, mock_parsed_url) == (
        result, hostname
    )
Example No. 11
def test_scrape(monkeypatch, page_content, links, to_visit):
    mock_response = Mock()
    mock_response.text = page_content
    mock_response.status_code = 200
    monkeypatch.setattr('website.requests.get', lambda x: mock_response)
    website = Website('http://hostname/url')
    website.scrape()
    # pages are 'http://hostname/url', 'http://hostname/new-url',
    # 'https://hostname/', 'http://hostname/', 'https://hostname/new-url'
    assert len(website.pages) == 5
Example No. 12
def test_scrape_url(monkeypatch, page_content, links, to_visit):
    mock_response = Mock()
    mock_response.text = page_content
    mock_response.status_code = 200
    monkeypatch.setattr('website.requests.get', lambda x: mock_response)
    website = Website('http://hostname/url')
    # Simulate visiting the page.
    url, _ = website.to_visit.popitem()
    website.scrape_url(url)
    assert website.to_visit == OrderedDict((key, None) for key in to_visit)
    assert website.pages[url].links == links
Example No. 13
    def __init__(self,
                 url,
                 keywords=None,
                 searchPageLimit=2,
                 websitesJsonFile="websites.json",
                 isInitialCrawl=False):

        # initialize class attributes
        self.baseUrl = url
        self.keywords = keywords
        self.articleLinks = []
        self.articleCount = 0
        self.searchPageLimit = searchPageLimit
        self.websitesJsonFile = websitesJsonFile
        self.isInitialCrawl = isInitialCrawl

        # instantiate a Website object to interact with the website to be crawled
        try:
            self.website = Website(url, websitesJsonFile=self.websitesJsonFile)

        # raise exception if there is an error connecting to the website
        except WebsiteFailedToInitialize:
            raise WebsiteFailedToInitialize(url)

        # open the json file containing websites and their attributes
        with open(self.websitesJsonFile) as data_file:
            self.websites = json.load(data_file)
            data_file.close()

        # set the searchQuery attribute to the appropriate search query structure in the websites json file
        for website, attributes in self.websites.items():
            if website in self.baseUrl:
                self.searchQuery = attributes["searchQuery"]
                self.nextPageType = attributes["nextPage"]

        # populate the exceptions list with websites whose article URLs need to be
        # crawled manually
        self.exceptions = [
            "https://www.ourmidland.com/", "https://www.lakecountystar.com/",
            "https://www.northernexpress.com/", "https://www.manisteenews.com/"
        ]

        print("\r" + bcolors.OKGREEN + "[+]" + bcolors.ENDC + " Crawling " +
              self.baseUrl + "..." + bcolors.ENDC,
              end="")
        sys.stdout.flush()

        # start crawling
        self.crawl()

        print("\r" + bcolors.OKGREEN + "[+]" + bcolors.ENDC + " Crawled " +
              self.baseUrl + ": " + bcolors.OKGREEN +
              str(len(self.articleLinks)) + " URLs retrieved" + bcolors.ENDC)
Example No. 14
def test_find_links(page_content, hostname, scheme, links, to_visit):
    mock_parsed_url = Mock()
    mock_parsed_url.hostname = hostname
    mock_parsed_url.scheme = scheme
    mock_parsed_url.netloc = hostname
    website = Website('http://hostname/url')
    # Simulate visiting the page.
    website.to_visit.popitem()
    page = Page('a_url')
    bs = BeautifulSoup(page_content, 'html.parser')
    website.find_links(page, bs, mock_parsed_url)
    assert page.links == links
    assert website.to_visit == OrderedDict((key, None) for key in to_visit)
Example No. 15
def process_single_website(website_url):
    """Processes a single website and exports to csv string.
	"""
    txt_clean = FILEPATH_PREFIX + website_url + FILEPATH_TEXT_SUFFIX_CLEAN
    txt_block = FILEPATH_PREFIX + website_url + FILEPATH_TEXT_SUFFIX_BLOCK
    img_clean = FILEPATH_PREFIX + website_url + FILEPATH_IMAGE_SUFFIX_CLEAN
    img_block = FILEPATH_PREFIX + website_url + FILEPATH_IMAGE_SUFFIX_BLOCK

    website_clean = Website(txt_clean, img_clean, "clean")
    website_block = Website(txt_block, img_block, "block")
    pair = WebsitePair(website_clean, website_block)

    print(get_csv_header(website_clean, website_block, pair))
Example No. 16
def run_website():
    website = Website()

    @website.route('/')
    def index():
        return 200, 'users list'

    @website.route('/users/([0-9]+)')
    def user(user_id):
        if user_id not in ['1', '2']:
            return 404, ''
        return 200, f'user {user_id}'

    website.run(_ADDRESS)
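The regex route '/users/([0-9]+)' above implies that Website forwards capture groups to the handler as positional arguments. Below is a minimal sketch of that dispatch step under that assumption; dispatch is a hypothetical helper, not part of the snippet's Website class:

import re

def dispatch(routes, path):
    # routes: dict mapping pattern strings to handler callables.
    for pattern, handler in routes.items():
        match = re.fullmatch(pattern, path)
        if match:
            # Capture groups (e.g. the user id) become handler arguments.
            return handler(*match.groups())
    return 404, ''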
Example No. 17
    def __init__(self):
        self.w = Website()
        self.root = Tk()
        self.root.title("Auto site - Enter the fields")
        self.my_font = tkFont.Font(family="Helvetica", size=11)

        self.frame = Frame(self.root, height=800, width=800, \
                           padx=50, pady=10)
        self.frame.pack()

        self.fields()
        self.buttons()

        self.root.mainloop()
Example No. 18
def main():
    # Initialize different plans
    single_plan = Plan('Single', 49, 1)
    plus_plan = Plan('Plus', 99, 3)
    infinite_plan = Plan('Infinite', 249, -1)

    # Initialize multiple websites
    website_1 = Website('https://website_1.com')
    website_2 = Website('https://website_2.com')
    website_3 = Website('https://website_3.com')
    website_4 = Website('https://website_4.com')

    # Initialize multiple customers
    customer_1 = Customer('customer_1', '123456789', '*****@*****.**')
    customer_2 = Customer('customer_2', '123456789', '*****@*****.**')
    customer_3 = Customer('customer_3', '123456789', '*****@*****.**')

    # customer_1 subscribes to the single_plan
    customer_1.add_subscription(single_plan)
    print("{} has subscribed for {} plan".format(customer_1,
                                                 customer_1.subscription.plan))

    # customer_1 added one website
    customer_1.add_website(website_1)
    print("{} has added website {} as per the {} plan".format(customer_1, \
            customer_1.websites, customer_1.subscription.plan))

    # customer_1 cannot add more websites on the single_plan
    customer_1.add_website(website_2)
    print("{} can't add website {} as per the {} plan".format(customer_1, \
            website_2, customer_1.subscription.plan))

    # customer_1 can change plan from single_plan to plus_plan
    customer_1.change_plan(plus_plan)
    print("{} has changed his current plan {} to {} plan".format(customer_1, \
            single_plan, customer_1.subscription.plan))

    # customer_2 subscribes to the infinite_plan
    customer_2.add_subscription(infinite_plan)

    # customer_2 can add multiple websites
    customer_2.add_website(website_1)
    customer_2.add_website(website_2)
    customer_2.add_website(website_3)
    customer_2.add_website(website_4)

    print("{} has added four websites {} under infinite plan".format(customer_2, \
            customer_2.websites))
Example No. 19
    def go(self):
        self.work_pages(self.site)

        self.session.add(Website(url=self.site, title='', domain=self.site,
                                 pages_count=self.pages_count, HTML_version=0.0))

        self.session.commit()
Example No. 20
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        super().__init__(scope, id, **kwargs)

        ticker = dynamodb.Attribute(
            name='Ticker',
            type=dynamodb.AttributeType.STRING,
        )

        date = dynamodb.Attribute(
            name='Date',
            type=dynamodb.AttributeType.STRING,
        )

        table = dynamodb.Table(
            self,
            'StockHistory',
            partition_key=ticker,
            sort_key=date,
            billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
            removal_policy=core.RemovalPolicy.DESTROY,
            point_in_time_recovery=True,
        )

        index_name = 'Date-index'
        table.add_global_secondary_index(
            index_name=index_name,
            partition_key=date,
            sort_key=ticker,
            projection_type=dynamodb.ProjectionType.INCLUDE,
            non_key_attributes=['Name'])

        Importer(self, 'Importer', table=table)
        restapi = RestApi(self, 'Api', table=table, index_name=index_name)
        Website(self, 'Website', api=restapi.api)
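A hedged sketch of how a stack with this constructor is typically synthesized with CDK v1; the stack class name StockStack is an assumption, since the snippet only shows the __init__ body:

from aws_cdk import core

app = core.App()
StockStack(app, 'StockStack')  # hypothetical name for the class defined above
app.synth()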
Example No. 21
    def generate_relation_dict(self, news_sources, news_targets):
        '''
        generates a dictionary of string/list(int) in the format
        {source : target_count}
        ie. {s1 : [tc1, tc2, ... tcn],
        s2 : [tc1, tc2, ... tcn], ...
        sn : [tc1, tc2, ... tcn]}
        where sn is the source, tcn is the citation count of each target
        '''
        # initialize the relation dictionary.
        relation_dict = {}

        for source_name, source_url in news_sources.iteritems():
            # create an empty list with a specific size which describe the number
            # of target referenced by each source
            target_count = [0] * len(news_targets)
            # Find the articles which have a specific source website url
            articles = Article.objects(
                Q(website=Website.objects(homepage_url=source_url).only('homepage_url').first()) &
                Q(citations__exists=True)).only('citations')
            for article in articles:
                # Count the times that each target in the news_targets is in the
                # citation list for each article and put it in the target_count
                for citation in article.citations:
                    if not isinstance( citation, int ):
                        i = 0
                        while i < len(news_targets):
                            if citation.target_name.lower() == news_targets.keys()[i].lower():
                                target_count[i] += 1
                            i += 1
            relation_dict[source_name] = target_count
        return relation_dict
Example No. 22
def main():
    logger.info("Cartriage v5.0")
    parser = argparse.ArgumentParser(
        description="Retrieves information from printers.")
    parser.add_argument(
        "l",
        type=open,
        metavar="printers",
        help="Text file containing printer IP addresses, one for each line.")
    parser.add_argument("o",
                        metavar="output",
                        help="Filename for resulting HTML page.")
    parser.add_argument("-v", action="store_true", help="Enable verbose mode.")
    try:
        args = parser.parse_args()
        if args.v:
            logger.info("Enabled verbose mode")
            logger.setLevel(logging.DEBUG)
        logger.debug(args)
        startTime = time.time()
        time.clock()
        scanned, successfullyScanned, printers = runScan(args.l)
        elapsedTime = "%d seconds" % (time.time() - startTime)
        site = Website(scanned, successfullyScanned, printers, elapsedTime)
        with open(args.o, "w") as output:
            output.write(str(site))
        logger.info("Done! Results available in file: %s" % args.o)
        sys.exit(0)
    except IOError, e:
        logger.error(str(e))
        sys.exit(1)
Example No. 23
    def get_website(self, url: str, check_interval: int):
        """
        Instantiates a Website instance and safely returns the
        instance or None, depending on success.

        PARAMETERS: url: String, e.g. http://google.fr
                    check_interval: Positive integer in seconds.
                        Ping refresh frequency, e.g. 30 would
                        equate to a check every 30 seconds.

        RETURNS: Website instance or None.
        """
        try:
            website = Website(url=url, check_interval=check_interval)

        except Exception:
            print(
                "I wasn't able to connect with that URL.\n"
                + "Please revise it, including 'http://'"
                + " or 'https://' as appropriate."
            )
            return None

        return website
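A hedged usage example of the method above; the name of the owning object (monitor) is an assumption:

site = monitor.get_website('https://google.fr', check_interval=30)
if site is not None:
    print('Monitoring', site)  # only reached when the URL could be used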
Example No. 24
 def __init__(self, url, params, headers, cnx, cursor):
     self.url = url
     self.params = params
     self.headers = headers
     self.cnx = cnx
     self.cursor = cursor
     self.website = Website(url, params, headers, cursor, cnx)
Example No. 25
    def get_articles(self, number=None):
        global username

        show_article_template = Template(filename='get_articles.html')
        sources = User.objects(name=username).first().news_sources
        targets = User.objects(name=username).first().news_targets
        articles = []

        for s in sources:
            articles += Article.objects(website=Website.objects(name=s).first()).only('title', 'url').all()
        for t in targets:
            articles += Article.objects(website=Website.objects(name=t).first()).only('title', 'url').all()

        if not number:
            number = len(articles)

        return show_article_template.render(articles=articles[ :int(number)])
Example No. 26
def test_city():
    cities = []
    for city in URL_CITY_ARRAY:
        location = Website(city)
        location.set_directory('./wikipedia/')
        html = location.get_html()
        cities.append(location)
    """
    cities = []
    fact_book = []
    #for country in CIA_FACT_BOOK:
    #    CIA.cia_indexer(Website(country))
    for city in URL_CITY_ARRAY:
        cities.append(Website(city))
    for city in cities:
        wikipedia.wiki_study_city(city)
    """
    return
Example No. 27
    def getWebsites(self):
        websites = dict()

        with open('data1/websites.json') as data_file:
            websitesData = json.load(data_file)['websites']

        for website in websitesData:
            websites[website['id']] = Website(website)

        return websites
Example No. 28
def process_manifest():
    """Processes all websites in the manifest.
	"""
    m = manifest.MANIFEST
    for i in range(0, len(m)):
        entry = m[i]

        txt_clean = FILEPATH_PREFIX + entry[0] + FILEPATH_TEXT_SUFFIX_CLEAN
        txt_block = FILEPATH_PREFIX + entry[0] + FILEPATH_TEXT_SUFFIX_BLOCK
        img_clean = FILEPATH_PREFIX + entry[0] + FILEPATH_IMAGE_SUFFIX_CLEAN
        img_block = FILEPATH_PREFIX + entry[0] + FILEPATH_IMAGE_SUFFIX_BLOCK

        website_clean = Website(txt_clean, img_clean, "clean")
        website_block = Website(txt_block, img_block, "block")
        pair = WebsitePair(website_clean, website_block)

        if i == 0:
            print(get_csv_header(website_clean, website_block, pair))

        print(get_csv_string(website_clean, website_block, pair))
Example No. 29
    def generate_text(self, sites_file, search_limit, keep_to_sites):
        with open(sites_file, "r") as f:
            for site in f.readlines():
                sys.stderr.write("Working on: " + site + '\n')
                ws = Website(home_page=site,
                             search_limit=search_limit,
                             keep_to_site=keep_to_sites)
                self.sites.append(ws)

        for site in self.sites:
            for link in site.links:
                self.text += site.get_page_text(link)
Example No. 30
def read_file(filename):
    """
    Reads a file and returns a list of Website objects.
    """
    websites = []
    with open(filename) as f:
        for line in f:
            url, interval = line.split()
            website = Website(url, int(interval))
            websites.append(website)
    return websites
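A hedged usage example: the input file is expected to contain one '<url> <interval>' pair per line. The file name and its contents below are made up for illustration:

# websites.txt:
#     https://example.com 30
#     https://example.org 60
sites = read_file('websites.txt')
print(len(sites))  # one Website instance per line in the file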
Example No. 31
def start():
    """Fetch list of web pages asynchronously."""
    websites = Website.all()
    start_time = default_timer()

    loop = asyncio.new_event_loop()  # create event loop
    asyncio.set_event_loop(loop)  # set event loop
    future = asyncio.ensure_future(fetch_all(websites))  # tasks to do
    loop.run_until_complete(future)  # loop until done

    tot_elapsed = default_timer() - start_time
    helper.p('Total time: {0:5.2f}'.format(tot_elapsed))
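fetch_all is not defined in this snippet. A minimal sketch of what it might look like, assuming each Website object exposes an async fetch() coroutine:

import asyncio

async def fetch_all(websites):
    # Run every fetch concurrently and wait until all of them complete.
    await asyncio.gather(*(site.fetch() for site in websites))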
Example No. 32
class TopBetEu:
    def __init__(self, url, headers, params, cnx, cursor):
        self.url = url
        self.params = params
        self.headers = headers
        self.cnx = cnx
        self.cursor = cursor
        self.website = Website(url, params, headers, cursor, cnx)

    def parse(self):
        # get the webpage soup
        soup = self.website.soup()

        # for each game
        games = []
        for eventdiv in soup.find_all('div', class_='event'):
            # read the game header
            header = eventdiv.find('h3').text

            # read the teams
            match = re.search(r'(\w.+) at (\w.+) ', header)
            awayteamstr = re.sub('-.+', '', match.group(1).replace('-N','').replace('-A','')).replace('.', '%')
            hometeamstr = re.sub('-.+', '', match.group(2).replace('-N','').replace('-A','')).replace('.', '%')

            # read the gametime
            match = re.search(r'(....)-(..)-(..)\s+(..):(..)', header)
            gametime = datetime(
                int(match.group(1)),
                int(match.group(2)),
                int(match.group(3))
            )

            # read the lines
            awaycell, homecell = [line for line in eventdiv.find_all('td', class_='money')]
            awayline = int(awaycell.text)
            homeline = int(homecell.text)

            games.append((
                awayteamstr,
                awayline,
                hometeamstr,
                homeline,
                gametime,
            ))

        return games

    def __repr__(self):
        return '{0}({1},{2},{3})'.format(self.__class__.__name__,
                                         self.url,
                                         self.params,
                                         self.headers)
Example No. 33
 def getSites(self):
     global conn
     global cur
     cur.execute("SELECT * FROM sites")
     sitesData = cur.fetchall()
     allSiteObjs = []
     for site in sitesData:
         siteObj = Website(site['id'], site['name'], site['url'],
                           site['searchUrl'], site['resultListing'],
                           site['resultUrl'], site['absoluteUrl'],
                           site['pageTitle'], site['pageBody'])
         allSiteObjs.append(siteObj)
     return allSiteObjs
Example No. 34
def main():
    """
    Main producer pipeline
    :return: None
    """
    db_credentials = form_db_credentials()
    message_broker_credentials = form_message_broker_credentials()
    create_tables(db_credentials, tables)
    while True:
        data_to_check = get_data_to_check(db_credentials)
        time_to_sleep = get_sleep_time(db_credentials)
        if not data_to_check:
            sleep(time_to_sleep)
            logger.info(f'sleeping {time_to_sleep} seconds')
            continue
        tasks = []
        for row in data_to_check:
            url, regexp = row
            website = Website(url, message_broker_credentials, regexp)
            tasks.append(website.perform_check())
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*tasks))
        logger.info(f'sleeping {time_to_sleep} seconds')
        sleep(time_to_sleep)
Example No. 35
def analyse_URL(jsonData):
    """
    Decide whether a website is phishing using its keywords and a Google search
    based on those.

    Parameters
    ----------
    jsonData: contains site data

    """

    ws = Website(json=jsonData)

    print(datetime.now().strftime("%H:%M:%S.%f") + "-- building vector")

    # build feature vector

    feat_vec_temp = {}
    feat_vect_site = build_feat_vec.feature_vector(extractor, ws)
    feat_vec_temp[0] = feat_vect_site
    feat_vect = DataFrame(feat_vec_temp)
    feat_vect = feat_vect.transpose().fillna(0)

    # prediction using gradient boosting
    exp = "238"

    features = feat_vect.columns

    print(datetime.now().strftime("%H:%M:%S.%f") +
          "-- vector done, start gradient boosting:")

    scoregb, predictiongb = _predict_gb(1, feat_vect, features, exp)
    gb_results = scoregb, predictiongb

    print(datetime.now().strftime("%H:%M:%S.%f") + "-- gradient done")
    global keep_track
    if keep_track:
        if gb_results[1] == 1:
            JSONtoFile(jsonData, True, jsonData['siteid'])
        else:
            JSONtoFile(jsonData, False, jsonData['siteid'])

    return gb_results, jsonData['jspageid'], jsonData['siteid']
Example No. 36
 def generate_relation_dict_beta(self, news_sources, news_targets):
     relation_dict = {}
     for source_name in news_sources:
         # create an empty list with a specific size which describe the number
         # of target referenced by each source
         target_count = [0] * len(news_targets)
         # Find the articles which have a specific source website url
         articles = Article.objects(
             Q(website=Website.objects(name=source_name).only('name').first()) &
             Q(citations__exists=True)).only('citations')
         for article in articles:
             # Count the times that each target in the news_targets is in the
             # citation list for each article and put it in the target_count
             for citation in article.citations:
                 if not isinstance( citation, int ):
                     i = 0
                     while i < len(news_targets):
                         if citation.target_name.lower() == news_targets[i].lower():
                             target_count[i] += 1
                         i += 1
         relation_dict[source_name] = target_count
     return relation_dict
Example No. 37
def test_get_alexa_rank(website):
    websiteInstance = Website("www.gwern.net", None)
    print websiteInstance.get_alexa_rank("www.gwern.net")
    print websiteInstance.get_alexa_rank("www.a16z.com")
    print websiteInstance.get_alexa_rank("www.lesswrong.com")
    print websiteInstance.get_alexa_rank("www.facebook.com")
    print websiteInstance.get_alexa_rank("www.amazon.com")
Example No. 38
#! /usr/bin/python

from website import Website
import sys

siteObj = Website(sys.argv[1:])
siteObj.baseSearchURL = 'https://mail.google.com/mail/u/0/#inbox'
Example No. 39
class Pinnacle:
    def __init__(self, url, params, headers, cnx, cursor):
        self.url = url
        self.params = params
        self.headers = headers
        self.cnx = cnx
        self.cursor = cursor
        self.website = Website(url, params, headers, cursor, cnx)

    def parse(self):
        print 'parsing:', str(self)

        # get the webpage soup
        soup = self.website.soup()

        # the datatables
        tables = soup.find_all('table', class_='linesTbl')

        # slurp up rows (they come in groups of three)
        gamerows = {}
        for table in tables:
            # get the date for this table
            datestr = table.select('.linesHeader')[0].find('h4').text
            match = re.search(r'(\d{0,2})/(\d{0,2})', datestr)
            month = int(match.group(1))
            day = int(match.group(2))
            date = datetime.date(2015, month, day)
            gamerows[date] = []

            # sigh, go through all colors of table
            for row in table.select('.linesAlt1'):
                gamerows[date].append(row)
            for row in table.select('.linesAlt2'):
                gamerows[date].append(row)

        # group rows into 3 tuples
        # http://code.activestate.com/recipes/303060-group-a-list-into-sequential-n-tuples/
        gametuples = {}
        for date in gamerows:
            gametuples[date] = []
            for i in range(0, len(gamerows[date]), 3):
                tup = gamerows[date][i:i+3]
                if len(tup) == 3:
                    gametuples[date].append(tuple(tup))

        # go through for times and lines
        lines = []
        for date in gametuples:
            for linerowa, linerowb, draw in gametuples[date]:
                # get the lines
                lineaname = linerowa.select('.linesTeam')[0].text
                linebname = linerowb.select('.linesTeam')[0].text
                linealine = float(linerowa.select('.linesMLine')[0].text or -1)
                linebline = float(linerowb.select('.linesMLine')[0].text or -1)
                drawline = float(draw.select('.linesMLine')[0].text or -1)
                
                lines.append((lineaname, linealine, linebname, linebline, drawline, datetime.datetime.combine(date, datetime.time())))

        return lines

    def __repr__(self):
        return '{0}({1})'.format(self.__class__.__name__,
                                 self.website)