Example #1
	def addComment(page_id, username, comment_text):
		"""
		Adds a comment to the given page, numbering it one higher than the
		page's current highest comment number.

		Returns 'Success'.
		"""
		first_row = utils.query("SELECT intCommentNumber FROM tb_Comments WHERE intCommentNumber = (SELECT MAX(intCommentNumber) FROM tb_Comments WHERE chvPage = ?) AND chvPage = ?", (page_id, page_id), True)
		highestCommentNumber = 0
		if first_row:
			highestCommentNumber = int(first_row['intCommentNumber'])
		utils.query("INSERT INTO tb_Comments VALUES (?, ?, CURRENT_TIMESTAMP, ?, ?, 0, NULL)", (page_id, highestCommentNumber + 1, username, comment_text))
		return 'Success'
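The `utils.query` helper that Example #1 (and Examples #11-#13 below) calls with SQL and bound parameters is not shown on this page. A minimal sketch of what it presumably looks like, assuming an SQLite database and the signature `query(sql, params=(), fetch_one=False)` implied by the call sites; the database path is a placeholder:

import sqlite3

DB_PATH = 'comments.db'  # placeholder; the real database path is not shown in these examples

def query(sql, params=(), fetch_one=False):
    """Assumed helper: run a parameterized statement, return dict rows for SELECTs."""
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        cur = conn.execute(sql, params)
        if sql.lstrip().upper().startswith('SELECT'):
            if fetch_one:
                row = cur.fetchone()
                return dict(row) if row else None
            return [dict(r) for r in cur.fetchall()]
        conn.commit()  # INSERT/UPDATE/DELETE: persist the change
        return None
    finally:
        conn.close()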
Example #2
    def download_repository(self, attempt, zip_name):
        url = BASE_URL.format(name = attempt.repo.repo_name())
        response = utils.query(url)
        data = response.text
        download_url = re.search(r'https://[^ ]*?\.zip', data).group(0)

        response = utils.query(download_url)
        with open(zip_name, 'wb') as zip_file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    zip_file.write(chunk)
                    zip_file.flush()
    # DEF
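In Example #2 (and Examples #9 and #14 below) `utils.query` is evidently an HTTP helper rather than the SQL helper above: the result exposes .text and .iter_content, and it accepts an auth keyword. A minimal sketch, assuming it is a thin wrapper around requests.get; the streaming flag and the error check are assumptions:

import requests

def query(url, auth=None):
    """Assumed HTTP wrapper: GET the URL and return the requests.Response."""
    # stream=True is an assumption so large downloads (Example #2) can be read
    # in chunks via iter_content instead of being buffered whole in memory.
    response = requests.get(url, auth=auth, stream=True)
    response.raise_for_status()  # assumed: raise on HTTP error statuses
    return response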
Example #3
    def relevant_page_ids(self, fresh=False, filename='../workspace/relevant_page_ids.p'):
        """
        Collect the wiki IDs (page_ids) for all pages belonging to self.relevant_categories
            * if fresh: fresh download from Wikipedia + pickled under filename
            * else: no download; page_ids loaded from pickle in filename
        """
        print('>>> Getting page_ids from relevant categories')
        self.page_ids = set()

        if fresh:
            for category in self.relevant_categories:
                for result in query( {'generator':'categorymembers', 'gcmtitle':'Category:'+category, 'gcmlimit':'500'}):
                    for page in result['pages']:
                        page_id = result['pages'][page]['title']
                        self.page_ids.add(page_id)
                        if len(self.page_ids)%1000 == 0:
                            print('\t+ nb pages downloaded:', len(self.page_ids))

            self.page_ids = sorted(self.page_ids)
            pickle.dump(self.page_ids, open(filename, 'wb'))

        else:
            self.page_ids = pickle.load(open(filename, 'rb'))

        print('\t+ set', len(self.page_ids), 'page_ids')

        return self.page_ids
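Examples #3 and #7 use a module-level query generator that is not shown here: it takes a dict of MediaWiki API parameters and yields successive result batches (exposing keys such as 'pages' or 'backlinks'), i.e. it handles API continuation. A minimal sketch, assuming the standard MediaWiki action API via requests; the endpoint is an assumption (the category prefixes in Example #7 suggest the Dutch Wikipedia):

import requests

API_URL = 'https://nl.wikipedia.org/w/api.php'  # assumed endpoint

def query(params):
    """Assumed continuation-aware wrapper around the MediaWiki action API."""
    request = dict(params, action='query', format='json')
    last_continue = {}
    while True:
        req = dict(request, **last_continue)
        result = requests.get(API_URL, params=req).json()
        if 'error' in result:
            raise RuntimeError(result['error'])
        if 'query' in result:
            yield result['query']           # e.g. {'pages': {...}} or {'backlinks': [...]}
        if 'continue' not in result:
            break
        last_continue = result['continue']  # carry the continuation token forward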
Example #4
    def search(self):
        # Load and parse!
        response = utils.query(self.next_url())
        soup = BeautifulSoup(response.text, 'html.parser')
        titles = soup.find_all(class_='node-project-distribution')
        LOG.info("Found %d repositories" % len(titles))

        # Pick through the results and find repos
        for title in titles:
            name = title.contents[1].contents[0]['href'].split('/')[2]
            try:
                self.add_repository(name, '')
            except:
                traceback.print_exc()
            # Sleep for a little bit to prevent us from getting blocked
            time.sleep(DRUPAL_SLEEP)
        ## FOR

        # Figure out what is the next page that we need to load
        try:
            next_page = soup.find(class_='pager-next').contents[0]
        except:
            next_page = None
        if not next_page or not next_page.has_attr('href'):
            LOG.info("No next page link found!")
            self.crawlerStatus.next_url = None
        else:
            self.crawlerStatus.next_url = DRUPAL_HOST + next_page['href']

        # Make sure we update our crawler status
        LOG.info("Updating status for %s" % self.crawlerStatus)
        self.crawlerStatus.save()
            
        return
Example #5
def handle(builds, environ, cmd=None):
    error = None

    try:
        settings = json.loads(query(environ, 'settings', '{}'))
    except:
        log.exception("Error in json parsing the settings variable")
        error = escape(make_trace())
        settings = {}

    for e in sorted(settings_validator.iter_errors(settings)):
        if error is None:
            error = ""
        error += str(e) + "\n"

    if error is not None:
        log.error("Errors from schema: " + error)
        yield '<result>\n<error>' + error + '</error>\n</result>\n'
    else:
        incremental = query(environ, 'incremental', '')
        incremental = incremental.lower() == 'true'

        try:
            if cmd == "makefile":
                log.info("Returning makefile")
                yield makefile(settings)
            elif cmd == "join":
                log.info("Joining existing build")
                yield "<result>\n"
                hashnumber = query(environ, 'hash', '')
                for k in join_from_hash(builds, hashnumber, incremental):
                    yield k
            else:
                log.info("Starting a new build")
                yield "<result>\n"
                for k in build(builds, text(environ), settings, incremental, "xml"):
                    yield k
        except:
            trace = make_trace()
            log.exception("Error in handle")
            yield '<trace>' + escape(trace) + '</trace>\n'
            yield '</result>\n'
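In Example #5, query is a different helper again: query(environ, 'settings', '{}') reads a named request parameter from a WSGI environ with a default. A minimal sketch, assuming the parameters arrive in the query string; the real helper may also read POST bodies:

from urllib.parse import parse_qs

def query(environ, name, default=''):
    """Assumed WSGI helper: return the named request parameter, or a default."""
    params = parse_qs(environ.get('QUERY_STRING', ''))
    values = params.get(name)
    return values[0] if values else default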
Example #6
def main():
    vw = []
    sl = []
    while True:
        inp = raw_input("> ")

        inp = inp.strip()
        words = inp.split()
        if not words:  # ignore blank input rather than crash on words[0]
            continue

        cmd = words[0]
        if cmd == "/save":
            for temp in vw:
                temp.finish()
            sys.exit(1)
        if cmd == "/train":
            data = " ".join(words[1:]).strip()
            for i in range(10):
                for temp in sl:
                    temp.learn(preprocess([data]))
        elif cmd == "/query":
            data = " ".join(words[1:]).strip()
            output = set()
            for s in sl:
                output.add(postprocess(query(s, data)))
            for out in output:
                print "\t", out
        elif cmd == "/start":
            data = " ".join(words[1:]).strip()
            if os.path.isfile(data + ".1") and os.path.isfile(data + ".2") and os.path.isfile(
                            data + ".3") and os.path.isfile(data + ".4"):
                vw = [
                    pyvw.vw("--quiet -i " + data + ".1 -f "+data + ".1"),
                    pyvw.vw("--quiet -i " + data + ".2 -f "+data + ".2"),
                    pyvw.vw("--quiet -i " + data + ".3 -f "+data + ".3"),
                    pyvw.vw("--quiet -i " + data + ".4 -f "+data + ".4")
                ]
            else:
                vw = [
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".1"),
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".2"),
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".3"),
                    pyvw.vw("--search 3 --quiet --search_task hook --ring_size 2048 -f " + data + ".4")
                ]
            sl = [
                vw[0].init_search_task(SequenceLabeler),
                vw[1].init_search_task(SequenceLabeler2),
                vw[2].init_search_task(SequenceLabeler3),
                vw[3].init_search_task(SequenceLabeler4)
            ]
Example #7
    def backlinking_pages(self, page_ids=None, ignore_categories=None, fresh=False, filename='../workspace/backlinks.p'):
        """
        Sets a dict (self.backlinks) mapping each page_id to the set of pages that link back to it.
            * if fresh: fresh download from Wikipedia + pickled under filename
            * else: no download; backlinks loaded from pickle in filename
        In the case of a fresh download:
            * if page_ids is None, self.page_ids is used
            * backlinks whose titles start with one of the prefixes in ignore_categories are skipped
        """
        self.backlinks = {}

        if fresh:
            if not page_ids:
                page_ids = self.page_ids
            print('>>> Collecting backlinks for', len(page_ids), 'pages')

            if not ignore_categories:
                ignore_categories = ('Gebruiker:', 'Lijst van', 'Portaal:', 'Overleg', 'Wikipedia:', 'Help:', 'Categorie:')

            for idx, page_id in enumerate(page_ids):
                self.backlinks[page_id] = set()
                for result in query({'action':'query', 'list':'backlinks', 'format':'json', 'bltitle':page_id}):
                    for backlink in result['backlinks']:
                        backlink = backlink['title'].replace('_', ' ') # clean up
                        if not backlink.startswith(ignore_categories):
                            self.backlinks[page_id].add(backlink)
                if idx % 10 == 0:
                    print('\t+ collected', sum([len(v) for k,v in self.backlinks.items()]), 'backlinks for', idx+1, 'pages')

            self.backlinks = {k:v for k,v in self.backlinks.items() if v} # remove pages without relevant backlinks
            pickle.dump(self.backlinks, open(filename, 'wb')) # dump for later reuse

        else:
            self.backlinks = pickle.load(open(filename, 'rb'))

        print('\t+ loaded', sum([len(v) for k,v in self.backlinks.items()]), 'backlinks for', len(self.backlinks), 'pages')
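A short usage sketch tying Examples #3 and #7 together; the class name and the category value are assumptions, since only the two methods and the relevant_categories attribute appear in the examples themselves:

# Hypothetical driver: WikiCollector is an assumed class name.
collector = WikiCollector()
collector.relevant_categories = ['Geschiedenis van Nederland']  # placeholder category

collector.relevant_page_ids(fresh=True)    # download page ids and pickle them
collector.backlinking_pages(fresh=True)    # collect backlinks for those pages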
Example #8
def get_versions(package):  # signature inferred from the get_versions(package) call in the __main__ block below
    pf = PackageFinder(find_links=[], index_urls=host, use_wheel=True, allow_external=[], allow_unverified=[], allow_all_external=False, allow_all_prereleases=False, process_dependency_links=False, session=session,)

    location = [Link(url, trusted=True)]
    req = InstallRequirement.from_line(package, None)
    versions = []
    for page in pf._get_pages(location, req):
        versions = versions + [version for _, _, version in pf._package_versions(page.links, package)]
    return versions

if __name__ == '__main__':
# e.g. add a new location
    url = "https://pypi.python.org/simple/"
    print url
    while True:
        #response = urllib2.urlopen(url)
        response = query(url)
        soup = BeautifulSoup(response.read())
        for link in soup.find_all("a"):
            package = link.get('href')
            try:
                versions = get_versions(package)
            except:
                traceback.print_exc()
                continue
            for version in versions:
                #package_type = Type.objects.get(app_type = 'Django: Library')
                pkg, created = Package.objects.get_or_create(package_type=Type(name='Django'), name=package, version=version)
                if created:
                    print "found new package: " + package + "==" + version
                else:
                    print "package already exists: " + package + "==" + version
Example #9
    def github_query(self, url):
        return utils.query(url, auth=self.auth)
Example #10
    def dustcube_info(self):
        qresults = ut.query(float(self.glon), float(self.glat), coordsys='gal')
        if not qresults['success']:
            raise RuntimeWarning(
                'No successful distance determination in dust cube!')
        return qresults
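Example #10's ut.query looks like a web query against a 3D dust-map service (the 'success' flag it checks comes straight from the service's JSON reply). A heavily hedged sketch in the style of the query helper published with the Argonaut/Green et al. dust map; the endpoint and payload layout are assumptions and may have changed:

import json
import requests

DUSTMAP_URL = 'http://argonaut.skymaps.info/gal-lb-query-light'  # assumed endpoint

def query(lon, lat, coordsys='gal', mode='full'):
    """Assumed reconstruction of ut.query: POST one sight line, return the JSON reply."""
    payload = {'mode': mode}
    if coordsys.lower() in ('gal', 'g'):
        payload['l'], payload['b'] = lon, lat
    elif coordsys.lower() in ('equ', 'e'):
        payload['ra'], payload['dec'] = lon, lat
    else:
        raise ValueError("coordsys '%s' not understood" % coordsys)
    response = requests.post(DUSTMAP_URL, data=json.dumps(payload),
                             headers={'content-type': 'application/json'})
    response.raise_for_status()
    return response.json()  # includes the 'success' flag checked in dustcube_info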
Example #11
	def getAllComments():
		"""Returns all comments."""
		return utils.query("SELECT * FROM tb_Comments")
Example #12
	def deleteComment(page_id, page_number):
		"""Soft-deletes the page_number-th comment of page_id (marks it deleted and timestamps the deletion)."""
		utils.query("UPDATE tb_Comments SET bIsDeleted=1, dtmDeleted=CURRENT_TIMESTAMP WHERE chvPage=? AND intCommentNumber=?", (page_id, page_number))
		return 'Success'
Example #13
	def getComments(page_id):
		"""
		Returns all comments for page_id, serialized as JSON.
		"""
		return json.dumps(utils.query("SELECT * FROM tb_Comments WHERE chvPage = ?", (page_id,)))
Example #14
    def get_latest_sha(self, repo):
        url = BASE_URL.format(name=repo.repo_name())
        response = utils.query(url)
        data = response.text
        results = re.findall(COMMIT_URL.format(sha=r'(\d+)'), data)
        return results[1]