Example #1
	def __init__(self, phenny):
		Download.__init__(self)
		self.re_title = re.compile('ustream\.vars\.channelTitle\=\"(.*?)\"\;ustream\.vars\.')
		self.re_channel = re.compile('ustream\.vars\.channelId\=(\d*?)\;ustream\.vars\.')
		self.apikey = phenny.config.ustreamdevapikey
		self.urltype = None
		self.h = {}
Example #2
 def __init__(self):
     Download.__init__(self)
     self.re_idblob = re.compile("si:\"(.+?)\",")
     self.re_title = re.compile(
         "<title>(.*?) \|  Video on TED\.com<\/title>")
     self.urltype = VidType.TED
     self.h = {}
Example #3
    def doRequest(self):
        d = Download(self.Url)
        if d.doRequest():
            return 1

        self.recs = d.getSOURCE()
        return 0
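The snippets in this listing never show the Download class itself; most of them only assume that doRequest() returns 0 on success (1 on failure) and that getSOURCE() returns the fetched body. A minimal sketch under that assumption follows; the urllib-based implementation, constructor signature, and timeout are illustrative guesses, not the original class:

import urllib.request

class Download:
    """Hypothetical stand-in for the Download wrapper used in these examples."""

    def __init__(self, url, user_agent=None):
        self.url = url
        self.user_agent = user_agent
        self.source = None

    def doRequest(self):
        # Return 0 on success, 1 on failure, mirroring the callers above.
        try:
            request = urllib.request.Request(self.url)
            if self.user_agent:
                request.add_header('User-Agent', self.user_agent)
            with urllib.request.urlopen(request, timeout=30) as response:
                self.source = response.read().decode('utf-8', errors='replace')
            return 0
        except Exception:
            return 1

    def getSOURCE(self):
        # Return the previously fetched page body (None if doRequest failed).
        return self.source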
Example #4
 def __init__(self):
     """
     Class to manage copying the data from a layout
     /dataset/class1/
     /dataset/class1/test
     to
     /train/class1
     test/class2
     """
     # variable holding the location of the dataset
     self.data_dir = "/Users/Eric/Desktop/eric/Programmazione/python/DeepLearning/data/knifey-spoony"
     # URL from which a dataset can be downloaded
     self.data_url = "https://github.com/Hvass-Labs/knifey-spoony/raw/master/knifey-spoony.tar.gz"
     # path of the train folder
     self.train_dir = os.path.join(self.data_dir, "train/")
     # path of the test folder
     self.test_dir = os.path.join(self.data_dir, "test/")
     # image size
     self.image_size = 200
     # image channels
     self.num_channels = 3
     self.img_shape = [self.image_size, self.image_size, self.num_channels]
     self.img_size_flat = self.image_size * self.image_size * self.num_channels
     # number of classes in the dataset
     self.num_classes = 3
     self.download = Download()
Example #5
 def dl(self):
     """
     Downloads the highest quality picture available.
     Returns False if something goes wrong.
     """
     if(self.orig_url == ""):
         if(self.hq_url == ""):
             down = Download(self.lq_url, self.config.get_image_folder())
             if(down.perform()):
                 return True
         else:
             down = Download(self.hq_url, self.config.get_image_folder())
             if(down.perform()):
                 return True
     else:
         down = Download(self.orig_url, as_var=True)
         if(down.perform()):
             result = down.get_result()
             soup = BeautifulSoup(result.getvalue())
             download_link = soup.find("a", text="this link")
             orig_url = self.dl_url_base + download_link["href"]
             time.sleep(120)
             down = Download(orig_url, self.config.get_image_folder())
             if(down.perform()):
                 self.file_name = down.get_output_name()
                 return True
     return False
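The example above relies on a second flavour of Download, driven by perform(), get_result() and get_output_name() rather than doRequest()/getSOURCE(). A minimal sketch under that assumption; the parameter names, urllib-based body and file-naming rule are illustrative guesses:

import io
import os
import urllib.parse
import urllib.request

class Download:
    """Hypothetical perform()/get_result() style downloader."""

    def __init__(self, url, target_folder=None, as_var=False, post_dict=None):
        self.url = url
        self.target_folder = target_folder
        self.as_var = as_var            # True: keep the result in memory
        self.post_dict = post_dict      # optional form data to POST
        self._buffer = io.BytesIO()
        self._output_name = None

    def perform(self):
        # Return True on success, False on any failure, as the callers expect.
        try:
            data = None
            if self.post_dict:
                data = urllib.parse.urlencode(self.post_dict).encode('utf-8')
            with urllib.request.urlopen(self.url, data=data, timeout=60) as response:
                payload = response.read()
            if self.as_var:
                self._buffer.write(payload)
            else:
                self._output_name = os.path.join(
                    self.target_folder or '.', os.path.basename(self.url))
                with open(self._output_name, 'wb') as out_file:
                    out_file.write(payload)
            return True
        except Exception:
            return False

    def get_result(self):
        # Callers use .getvalue() on this, so hand back the BytesIO buffer.
        return self._buffer

    def get_output_name(self):
        return self._output_name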
Example #6
 def doRequest(self):
   d = Download(self.Url)
   if d.doRequest():
     return 1
   
   self.recs = d.getSOURCE()
   return 0
Example #7
 def __init__(self):
     Download.__init__(self)
     self.re_id = re.compile("swfobject\.embedSWF\(\"(.*?)\"")
     self.re_title = re.compile(
         "\<title\>MIT TechTV \&ndash\; (.*?)\<\/title\>")
     self.urltype = VidType.MITTECHTV
     self.h = {}
Example #8
 def __init__(self):
     Download.__init__(self)
     self.re_id = re.compile(
         "<meta\s+name\=\"item\-id\"\s+content\=\"(.*?)\"")
     self.re_title = re.compile(
         "<meta\s+name\=\"title\"\s+content\=\"(.*?)\"")
     self.urltype = VidType.VEOH
     self.h = {}
Example #9
 def __init__(self):
     Download.__init__(self)
     self.re_vid = re.compile(
         '<meta property="og:video"\s+content="http://blip\.tv/play/(.*?)"/>'
     )
     self.re_title = re.compile('<title>(.*?)</title>')
     self.urltype = VidType.BLIPTV
     self.h = {}
Example #10
 def __init__(self):
     Download.__init__(self)
     self.re_live_id = re.compile('stream_id \= (\d+?)\;')
     self.re_recorded_id = re.compile('full_program_clipid \= (\d+?)\;')
     self.re_live_title = re.compile('stream_title \= "(.+?)"\;')
     self.re_recorded_title = re.compile('full_program_title \= "(.+?)"\;')
     self.type = None
     self.urltype = None
     self.h = {}
Example #11
	def __init__(self):
		Download.__init__(self)
		self.re_live_id = re.compile('stream_id \= (\d+?)\;')
		self.re_recorded_id = re.compile('full_program_clipid \= (\d+?)\;')
		self.re_live_title = re.compile('stream_title \= "(.+?)"\;')
		self.re_recorded_title = re.compile('full_program_title \= "(.+?)"\;')
		self.type = None
		self.urltype = None
		self.h = {}
Example #12
def isGoogleSearch(schema, ip):
  d = Download(schema + '://' + ip)
  if d.doRequest():
    return False

  if Utility.containsGoogle(d.getSOURCE()):
    return True
  
  return False
Example #13
	def __init__(self, phenny):
		Download.__init__(self)
		self.re_id = re.compile("^[a-zA-Z0-9\_\-]{11}$")
		self.re_fragment = re.compile("^t\=((\d+)h)?((\d+)m)?((\d+)s)?$")
		self.urltype = VidType.YOUTUBE
		self.h = {}
		self.gdatahost = 'gdata.youtube.com'
		self.developer_key = phenny.config.youtubedevapikey
		self.gdataxml = None
Example #14
def isGoogleSearch(schema, ip):
    d = Download(schema + '://' + ip)
    if d.doRequest():
        return False

    if Utility.containsGoogle(d.getSOURCE()):
        return True

    return False
Example #15
    def requestHtml(self):
        url = self.BaseUrl + self.ISBN
        # print url, self.User_Agent
        d = Download(url, self.User_Agent)
        if d.doRequest():
            return 1

        self.HTML = d.getSOURCE()

        return 0
Example #16
    def run(self):
        url = self.BASE_URL + self.SeasonId + self.BASE_URL_PART_3 + str(self.PageNumber) + self.BASE_URL_PART_5
        d = Download(url)
        if d.doRequest():
            # fail
            print 'ERROR: ' + self.SeasonId + '-' + str(self.PageNumber)
        else:
            utfstr2file(d.getSOURCE(), './data/' + self.SeasonId + '-' + str(self.PageNumber) + '.raw')

        return url
Example #17
    def request(self):
        baseUrl = "http://shaishufang.com/index.php/site/main/uid/"
        postFix = "/friend/false/category//status//type//page/"
        url = baseUrl + self.UID + postFix + str(self.Page)

        d = Download(url, self.Cookie, self.Proxy)
        if d.doRequest():
            return False

        self.HTML = d.getSOURCE()
        return True
Example #18
 def doRequest(self):
   playerId = str(self.PlayerId)
   seasonType = self.SeasonType.replace(" ", "+")
   url = self.Url + "PlayerId=" + playerId + "&SeasonType=" + seasonType + "&League=" + self.LeagueId
   d = Download(url)
   
   if d.doRequest() == 1:
     return 1
   
   self.recs = dumps(loads(d.getSOURCE()))
   return 0
Example #19
    def request(self):
        baseUrl = 'http://shaishufang.com/index.php/site/detail/uid/'
        postFix = '/status//category/none/friend/false'
        url = baseUrl + self.UID + '/ubid/' + self.BID + postFix

        d = Download(url, self.Cookie, self.Proxy)
        if d.doRequest():
            return False

        self.HTML = d.getSOURCE()
        return True
Example #20
 def file_exists(self, file_path):
     hash_local = self.hash_file(file_path)
     download = Download(
         ("https://commons.wikimedia.org/w/api.php?action=query&list"
          "=allimages&format=json&aisha1=") + hash_local, as_var=True)
     if(download.perform()):
         content = download.get_result().getvalue()
         json_data = json.loads(content)
         if(len(json_data["query"]["allimages"]) > 0):
             return True
         else:
             return False
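The check above queries the MediaWiki allimages list with an aisha1 filter, i.e. it asks Wikimedia Commons for any file whose SHA-1 equals the local hash. A hedged equivalent using the requests library, where hash_file() in the snippet is assumed to return a hex SHA-1 digest:

import hashlib
import requests

def commons_has_file(file_path):
    # Compute the local SHA-1 the same way hash_file() presumably does.
    with open(file_path, 'rb') as fh:
        sha1 = hashlib.sha1(fh.read()).hexdigest()
    response = requests.get(
        'https://commons.wikimedia.org/w/api.php',
        params={
            'action': 'query',
            'list': 'allimages',
            'format': 'json',
            'aisha1': sha1,
        },
        timeout=30,
    )
    # Any hit means a file with identical content already exists on Commons.
    return len(response.json()['query']['allimages']) > 0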
Example #21
    def run(self):
        url = self.BASE_URL + self.SeasonId + self.BASE_URL_PART_3 + str(
            self.PageNumber) + self.BASE_URL_PART_5
        d = Download(url)
        if d.doRequest():
            # fail
            print 'ERROR: ' + self.SeasonId + '-' + str(self.PageNumber)
        else:
            utfstr2file(
                d.getSOURCE(), './data/' + self.SeasonId + '-' +
                str(self.PageNumber) + '.raw')

        return url
Example #22
  def getStats(self):
    d = Download(self.API)
    if d.doRequest():
      return False

    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][1]['rowSet']:
      res.append(item[1:])

    if len(res) == 0:
      return False
    else:
      return res
Example #23
  def run(self):
    while True:
      print 'INFO: ........................................ START'
      stats = self.dbm.getStats()
      print 'INFO: deadLinks-', stats[0], ' unvisitedLinks-', stats[1], ' visitedLinks-', stats[2]
      # get a url from unvisitedLinks
      url = self.dbm.retrieveUnvisitedLink()
      if url == False:
        print 'DEBUG: DONE -- retrieveUnvisitedLink return False'
        break

      print 'DEBUG: Processing ', url

      if not self.urlFilter.isPlainText(url):
        print 'DEBUG: NotPlainTextURL ', url
        continue
      
      if not self.domainFilter.isInDomain(url):
        print 'DEBUG: NOT IN DOMAIN ', url
        continue

      # request the url
      d = Download(url)
      if d.doRequest() == 1:
        if not self.dbm.createDeadLink(url):
          print 'DEBUG: deadLinks already contain ', url
        else:
          print 'DEBUG: Add To deadLinks ', url
      else:
        if self.dbm.createVisitedLink(url):
          print 'DEBUG: Add To visitedLinks ', url
        else:
          print 'DEBUG: Failed Add To visitedLinks ', url

        # extract urls from the source
        u = URLExtractor(d.getSOURCE(), url)
        tmpUrls = u.getUrls()
        if tmpUrls:
          for url in tmpUrls:
            if self.dbm.isInDeadLink(url):
              continue
            elif self.dbm.isInVisitedLink(url):
              continue
            elif self.dbm.isInUnvisitedLink(url):
              continue
            else:
              print 'DEBUG: Add To unvisitedLink ', url
              self.dbm.createUnvisitedLink(url)
    
      print 'INFO: ........................................ END'
Example #24
class DatasetManagement:
    def __init__(self):
        """
        Class to manage copying the data from a layout
        /dataset/class1/
        /dataset/class1/test
        to
        /train/class1
        test/class2
        """
        # variable holding the location of the dataset
        self.data_dir = "/Users/Eric/Desktop/eric/Programmazione/python/DeepLearning/data/knifey-spoony"
        # URL from which a dataset can be downloaded
        self.data_url = "https://github.com/Hvass-Labs/knifey-spoony/raw/master/knifey-spoony.tar.gz"
        # path of the train folder
        self.train_dir = os.path.join(self.data_dir, "train/")
        # path of the test folder
        self.test_dir = os.path.join(self.data_dir, "test/")
        # image size
        self.image_size = 200
        # image channels
        self.num_channels = 3
        self.img_shape = [self.image_size, self.image_size, self.num_channels]
        self.img_size_flat = self.image_size * self.image_size * self.num_channels
        # number of classes in the dataset
        self.num_classes = 3
        self.download = Download()

    def load(self):
        pass

    def execute(self):

        # handle loading the dataset from the internet or from local storage

        # download the dataset from the internet if it is not already present
        self.download.maybe_downlaod_and_extract(url=self.data_url,
                                                 download_dir=self.data_dir)

        # create the dataset instance
        cache_path = os.path.join(self.data_dir, "knifey-spoony.pkl")

        self.dataset = load_cached(cache_path=cache_path, in_dir=self.data_dir)

        # split the data into train and test by class, ready to be processed

        self.dataset.copy_files(train_dir=self.train_dir,
                                test_dir=self.test_dir)
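A possible way to drive the class above, assuming the Hvass-Labs helper modules (Download, load_cached) used in the snippet are importable:

if __name__ == '__main__':
    manager = DatasetManagement()
    # Downloads the archive if missing, builds the cached dataset,
    # then copies the files into the train/ and test/ folders.
    manager.execute()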
Example #25
def main():
	# print(sys.argv[0])
	# print(type(sys.argv[1]))
	print('\t\t###########\n\t\t# WELCOME #\n\t\t###########\n')
	lat1 = float(sys.argv[1])
	lon1 = float(sys.argv[2])
	lat2 = float(sys.argv[3])
	lon2 = float(sys.argv[4])
	print('\tStart coordinate (%f, %f)' % (lat1, lon1))
	print('\tEnd coordinate (%f, %f)' % (lat2, lon2))
	print('\tStart searching ...\n')
	sc = Search(lat1, lat2, lon1, lon2)
	sc.searchLevels()
	picl = sc.qkll[-1]
	lod = len(sc.qkll)
	print('\tSearching complete ... \n')
	dl = Download()
	tl = list()
	print('\tDownloading images ...\n')
	if not os.path.exists('./temp/'):
		os.makedirs('./temp/')

	for qk in picl:
		dl.getUrlImage(qk)
		tl.append(Tile(qk))
	print('\tDownloading complete ...\n')
	ts = TileSystem()
	
	pX1, pY1 = ts.latLongToPixelXY(sc.MinLatitude, sc.MinLongitude, lod)
	
	pX2, pY2 = ts.latLongToPixelXY(sc.MaxLatitude, sc.MaxLongitude, lod)
	print('\tStart merging ...\n')
	mg = Imerge(pX1, pX2, pY1, pY2, lod)
	for t in tl:
		mg.fillIm(t)
	print('\tMerging complete ...\n')
	fname = input('\tPlease give a name to the Image.\n\t\t')
	mg.saveFig(fname)
	f = open(fname, 'w')
	f.write('Start coordinate\n \t(%f, %f)\nEnd coordinate\n \t(%f, %f)' % (lat1, lon1, lat2, lon2))
	
	if 'y' == input('\tRemove caches? y?\n\t\t') :
		filelist = [f for f in os.listdir('./temp/')]
		for f in filelist:
			os.remove(os.path.join('./temp/', f))
		

	print('\t\t##########\n\t\t#  DONE  #\n\t\t##########\n')
Example #26
def worker(appids, isbns, appidsCycle):
    # appidsCycle = cycle(appids)

    for isbn in isbns:
        url = 'http://' + appidsCycle.next() + '.appspot.com/url?url=' + 'http://book.douban.com/isbn/' + str(isbn)
        # print 'DEBUG: ', url

        d = Download(url)
        if d.doRequest():
            print isbn, 'network error'
            continue

        j = json.loads(d.getSOURCE())
        print isbn, j['status_code']

    return
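The commented-out first line hints at how appidsCycle is built. A minimal Python 2 style driver for worker(), with made-up app IDs and sample ISBNs as placeholders:

from itertools import cycle

appids = ['proxy-app-a', 'proxy-app-b']     # hypothetical App Engine app IDs
isbns = ['9780132350884', '9780262033848']  # sample ISBNs to look up
worker(appids, isbns, cycle(appids))        # cycle() round-robins the proxies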
Example #27
File: TopIp.py Project: csrgxtu/gps
def main():
    ipList = getIps(config.IP_FILE)
    top = {}
    for ip in ipList:
        print ip
        start = int(round(time.time()))
        obj = Download("http://" + ip)
        if not obj.doRequest():
            end = int(round(time.time()))
            top[ip] = end - start
    tmp = [(v, k) for k, v in top.iteritems()]
    topList = []
    for item in sorted(tmp):
        topList.append(item[1])

    lst2File(topList, config.TOP_IP_FILE)
Example #28
    def run(self, processName='MainProcess'):
        for isbn in self.ISBNS:
            url = 'http://www.amazon.cn/s/ref=nb_sb_noss?field-keywords=' + isbn
            d = Download(url)
            if d.doRequest():
                print 'ERROR[' + processName + ']: ', isbn, 'NERR'
                appendstr2file(isbn, './NERR.txt')
                continue

            asin = ASINParser(d.getSOURCE())
            if asin.getAsin():
                print 'INFO[' + processName + ']: ', isbn, asin.getAsin()
                appendstr2file(isbn + ',' + asin.getAsin(), './OK.txt')
            else:
                print 'WARN[' + processName + ']: ', isbn, 'NOER'
                appendstr2file(isbn, './NOER.txt')
Example #29
 def find_urls(self):
     """
     Finds the download URLs with different qualities and saves them.
     """
     down = Download(self.url, as_var=True)
     if(down.perform()):
         result = down.get_result()
         soup = BeautifulSoup(result.getvalue())
         download_links = soup.find_all("a", {"class": "DownloadLink"})
         if(download_links):
             self.lq_url = download_links[0]["href"]
             self.hq_url = download_links[1]["href"]
         raw_link = soup.find(
             text="Other options available:").find_next("script").text
         m = re.search(r"href=..(.*\.\b[a-zA-Z0-9]+\b)", raw_link)
         if(m):
             self.orig_url = self.url_base + "/" + m.group(1)
Example #30
    def __init__(self, lat1, lat2, lon1, lon2) :
        self.ts = TileSystem()
        self.dl = Download()
        self.qkll = list()

        if lat1 > lat2 :
            self.MinLatitude = lat1
            self.MaxLatitude = lat2
        else :
            self.MinLatitude = lat2
            self.MaxLatitude = lat1

        if lon1 < lon2 :
            self.MinLongitude = lon1
            self.MaxLongitude = lon2
        else :
            self.MinLongitude = lon2
            self.MaxLongitude = lon1
Example #31
def Google_Web_Search_Helper(q, hl='en', start=0):
  Google_Web_Search_URL = 'https://www.google.com/search?'

  if not q:
    return {}
  else:
    Google_Web_Search_URL = Google_Web_Search_URL + 'q=' + q

  Google_Web_Search_URL = Google_Web_Search_URL + '&hl=' + hl
  Google_Web_Search_URL = Google_Web_Search_URL + '&start=' + str(start)  # start may be an int, so cast it

  d = Download(Google_Web_Search_URL)
  if d.doRequest():
    return {}
  else:
    g = GoogleSearchResultParser(d.getSOURCE())
    return g.getJson()
Example #32
"""
    def run(self, processName='MainProcess'):
        for asin in self.ASINS:
            url = 'http://www.amazon.cn/dp/' + asin
            d = Download(url)
            if d.doRequest():
                print 'ERROR[' + processName + ']: ', asin, 'NERR'
                appendstr2file(asin, './NERRBasicInfo.txt')
                continue

            b = BasicInfoParser(d.getSOURCE())
            jsonRes = b.basicInfo()

            if json.loads(jsonRes):
                print 'info[' + processName + ']: ', asin
                appendstr2file(jsonRes, './OKBasicInfo.txt')
            else:
                print 'WARN[' + processName + ']: ', asin, 'NOER'
                appendstr2file(asin, './NOERBasicInfo.txt')
Example #33
def Save(email):
    """ Saves the youtube videos and handles errors"""
    try:
        Download(email)
    except Exception as e:
        SendEmail(
            email['from'], email['subject'],
            'Something went wrong while downloading the ' + email['type'] +
            ' file: ' + email['url'] + '\n\nThe error was: ' + str(e))
        return False
Example #34
 def parse_web(self):
     down = Download(self.url, as_var=True)
     if(down.perform()):
         result = down.get_result()
         soup = BeautifulSoup(result.getvalue())
         mission_table = soup.find(
             text="Missions used in the Database").find_next("table")
         mission_params = mission_table.find("tbody").find_all("tr")
         for m in mission_params:
             mission_as_list = list(m.children)
             if(len(mission_as_list) > 5):
                 self.db.insert_mission(mission_as_list[0].text,
                                        mission_as_list[1].text,
                                        mission_as_list[2].text,
                                        self.parse_date(
                                            mission_as_list[3].text),
                                        self.parse_date(
                                            mission_as_list[4].text),
                                        mission_as_list[5].text)
Example #35
  def walker(self):
    while True:
      urls = self.dbm.retrieveUnvisitedLinks(0, 100)
      urls = self.urlFilter.getFilteredUrls(urls)
      if len(urls) == 0:
        break

      for url in urls:
        print 'INFO: Processing ', url
        d = Download(url)
        if d.doRequest() == 1:
          self.dbm.createDeadLink(url)
        else:
          self.dbm.createVisitedLink(url)
          u = URLExtractor(d.getSOURCE(), url)
          tmpUrls = u.getUrls()
          if tmpUrls:
            self.dbm.createUnvisitedLinks(list(set(tmpUrls)))

    return True
Example #36
class Search:
    def __init__(self, lat1, lat2, lon1, lon2) :
        self.ts = TileSystem()
        self.dl = Download()
        self.qkll = list()

        if lat1 > lat2 :
            self.MinLatitude = lat1
            self.MaxLatitude = lat2
        else :
            self.MinLatitude = lat2
            self.MaxLatitude = lat1

        if lon1 < lon2 :
            self.MinLongitude = lon1
            self.MaxLongitude = lon2
        else :
            self.MinLongitude = lon2
            self.MaxLongitude = lon1

    def getTileXY(self, lat, lon, levelOfDetail) :
        
        pX, pY = self.ts.latLongToPixelXY(lat, lon, levelOfDetail)
        tX, tY = self.ts.pixelXYToTileXY(pX, pY)
        return tX, tY

    def search1Level(self, levelOfDetail) :
        tX1, tY1 = self.getTileXY(self.MinLatitude, self.MinLongitude, levelOfDetail)
        tX2, tY2 = self.getTileXY(self.MaxLatitude, self.MaxLongitude, levelOfDetail)
        print('\tStart tileXY (%d, %d)' % (tX1, tY1))
        print('\tStart tileXY (%d, %d)' % (tX2, tY2))
        re = list()

        for i in range(tY1, tY2 + 1) :
            for j in range(tX1, tX2 + 1) :
                qk = self.ts.tileXYToQuadKey(j, i, levelOfDetail)
                if self.dl.getUrlResponse(qk) :
                    re.append(qk)
                else :
                    return None

        return re

    def searchLevels(self) :
        lod = 1
        ql = self.search1Level(lod)
        while ql:
            self.qkll.append(ql)
            lod += 1
            ql = self.search1Level(lod)
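searchLevels() keeps descending one level of detail at a time and stops at the first level where some tile does not respond, so qkll[-1] holds the quadkeys of the deepest fully covered level. For reference, a quadkey interleaves the bits of the tile X/Y coordinates, one digit per zoom level; a sketch of what TileSystem.tileXYToQuadKey presumably computes, following the Bing Maps tile-system convention:

def tile_xy_to_quadkey(tile_x, tile_y, level_of_detail):
    # One digit per level: bit i of tile_x contributes 1, bit i of tile_y contributes 2.
    digits = []
    for i in range(level_of_detail, 0, -1):
        digit = 0
        mask = 1 << (i - 1)
        if tile_x & mask:
            digit += 1
        if tile_y & mask:
            digit += 2
        digits.append(str(digit))
    return ''.join(digits)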
Example #37
    def getStats(self):
        d = Download(self.API)
        if d.doRequest():
            return False

        res = []
        j = loads(d.getSOURCE())
        for item in j['resultSets'][0]['rowSet']:
            tmp = []
            name = item[3]
            pos = item[5]
            if item[6] == 'null':
                height = 'None'
            else:
                height = item[6]
            if item[7] == " ":
                weight = 'None'
            else:
                weight = item[7]
            age = item[9]
            if item[10] == 'R' or item[10] == 'None' or item[10] == None:
                exp = 0
            else:
                exp = item[10]

            tmp.append(name)
            tmp.append(pos)
            tmp.append(height)
            tmp.append(weight)
            tmp.append(age)
            tmp.append(exp)
            res.append(tmp)

        if len(res) == 0:
            return False
        else:
            return res
Example #38
  def getStats(self):
    d = Download(self.API)
    if d.doRequest():
      return False

    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][0]['rowSet']:
      tmp = []
      name = item[3]
      pos = item[5]
      if item[6] == 'null':
        height = 'None'
      else:
        height = item[6]
      if item[7] == " ":
        weight = 'None'
      else:
        weight = item[7]
      age = item[9]
      if item[10] == 'R' or item[10] == 'None' or item[10] == None:
        exp = 0
      else:
        exp = item[10]

      tmp.append(name)
      tmp.append(pos)
      tmp.append(height)
      tmp.append(weight)
      tmp.append(age)
      tmp.append(exp)
      res.append(tmp)

    if len(res) == 0:
      return False
    else:
      return res
Example #39
    def process_download_file(self, file_to_process):
        with open(
                "ProcessingFiles" + self.directory_separator + file_to_process,
                "r") as download_file:
            try:
                download_data = json.loads(download_file.read())
                for f in sorted(download_data["Files"]):
                    self.map_download_directories(
                        f.replace(self.remote_directory_to_sync + "/", ""))

                for f in download_data["Files"]:
                    for file_to_download in download_data["Files"][f]:
                        Download(self.ftp_sync, f, file_to_download)

            except Exception as e:
                Logger("Error - Unable to download file: " +
                       str(download_file) + ", " + str(e))
Example #40
    def __init__(self, user, host, no_notify, verbose, interval, workflow_id=None):
        self.host = host
        self.user = user
        self.interval = interval
        self.cromwell = Cromwell(host=host)
        self.messenger = Messenger(self.user)
        self.no_notify = no_notify
        self.verbose = verbose
        self.workflow_id = workflow_id
        if user == "*":
            self.event_subscribers = [EmailNotification(self.cromwell),
                                        SystemTestDownload(), Download(self.cromwell.host), GATKDownload()]

            engine = create_engine("sqlite:///" + config.workflow_db)
            Base.metadata.bind = engine
            DBSession = sessionmaker()
            DBSession.bind = engine
            self.session = DBSession()
Example #41
 def parse_web(self):
     down = Download(self.url, as_var=True, post_dict=self.post_dict)
     found_start = False
     can_add = False
     if(down.perform()):
         web_string_etree = etree.fromstring(down.get_result().getvalue())
         for element in web_string_etree.iter("script"):
             redirect_url = element.text
         redirect_url_array = redirect_url.split("\"")
         down = Download(self.base_url + redirect_url_array[1], as_var=True)
         if(down.perform()):
             string_etree = html.fromstring(
                 down.get_result().getvalue())
             table = string_etree.xpath("//table[@id='QueryResults']")
             for element in table[0].iter("tr"):
                 list_of_elements = list(element.iter("td"))
                 if(len(list_of_elements) > 5):
                     a = list(list_of_elements[0].iter("a"))
                     if(found_start or self.no_need):
                         can_add = True
                     if(self.new_start):
                         if(self.new_start == a[0].text and not found_start):
                             found_start = True
                     if(can_add):
                         self.db.insert_image(a[0].attrib["href"],
                                              a[0].text,
                                              self.parse_date(
                             list_of_elements[1].text),
                             list_of_elements[2].text,
                             list_of_elements[3].text,
                             list_of_elements[4].text,
                             list_of_elements[5].text,
                             list_of_elements[6].text,
                             list_of_elements[7].text,
                             self.mission_id,
                             False, False)
                         self.db.update_mission_image_id(
                             self.mission_id, a[0].text)
             self.db.update_mission_image_id(
                 self.mission_id, str(0))
Example #42
	def __init__(self):
		Download.__init__(self)
		self.re_url = re.compile("^http\:\/\/(www\.)dailymotion\.com\/video\/([a-z0-9]+?)\_")
		self.urltype = VidType.DAILYMOTION
		self.h = {}
Example #43
 def __init__(self):
     Download.__init__(self)
     self.urltype = VidType.VIDDLER
     self.h = {}
Example #44
 def find_online_category(self, term):
     result = None
     down = Download(self.base_api + urllib.quote(term), as_var=True)
     if(down.perform()):
         result = down.get_result()
     return result
Example #45
#!/usr/bin/env python
#coding=utf-8
#
# Author: Archer Reilly
# Date: 11/Aug/2014
# File: PlayerInfoParserTest.py
# Description: test the PlayerInfoParser class
# Website: http://csrgxtu.blog.com/
#
# Produced By CSRGXTU
from PlayerInfoParser import PlayerInfoParser
from Download import Download

URL = "http://sports.qq.com/d/f_players/3/2890/"
player = Download(URL)
if player.doRequest() != 0:
    print "Download Can't Do Request"
else:
    print "Successfully Do Request"

playerParser = PlayerInfoParser(player.getSOURCE())
Example #46
	def __init__(self):
		Download.__init__(self)
		self.re_id = re.compile('\:content\:atom\.com\:(.+?)\"')
		self.urltype = VidType.ATOM
		self.h = {}
Example #47
	def __init__(self):
		Download.__init__(self)
		self.re_fragment = re.compile("^(t\=)?((\d+)h)?((\d+)m)?((\d+)s)?$")
		self.urltype = VidType.GOOGLEVIDEO
		self.h = {}
Example #48
	def __init__(self):
		Download.__init__(self)
		self.re_id = re.compile("swfobject\.embedSWF\(\"(.*?)\"")
		self.re_title = re.compile("\<title\>MIT TechTV \&ndash\; (.*?)\<\/title\>")
		self.urltype = VidType.MITTECHTV
		self.h = {}
Example #49
# Website: http://csrgxtu.blog.com/
#
# Produced By CSRGXTU
import requests
from Download import Download
from Parser import Parser
from TeamInfoParser import TeamInfoParser

"""
page = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
print page.text
parser = Parser(page.text)
#print parser.getBuyers()
"""
URL = "http://sports.qq.com/d/f_teams/1/42/"
soccer = Download(URL)
if soccer.doRequest() == 0:
  print "Successfully do request"
else:
  print "Failed do request"

html = soccer.getSOURCE()
parser = TeamInfoParser(html)
name = parser.getTeamName()
print "name:", unicode(name).encode('utf8')
name_cn = parser.getTeamNameCN()
print "name_cn:", unicode(name_cn).encode('utf8')
logo = parser.getTeamLogo()
print "logo:", logo
city = parser.getTeamCity()
print "city:", city
Example #50
class Spider:
    RE_EXPONEA = re.compile("^https?://[^/]*exponea.com(/.*)?$")
    downloaded = []
    visited = []

    def __init__(self, loop, url):
        self.__url = self.__remove_trailing_slash(url)
        self.__loop = loop
        self.__downloader = Download()
        self.__scraper = Scraper()

    def __gen_file_name(self, link):
        return link.replace("/", "_")

    def __remove_trailing_slash(self, url):
        if url[-1] == "/":
            return url[:-1]
        else:
            return url

    def __is_exponea(self, url):
        return Spider.RE_EXPONEA.match(url)

    # If I used Selenium this would be easier (following links),
    # but I wanted to keep it simple, so here we go:
    # (I'd use Selenium or some scraping framework next time)
    @asyncio.coroutine
    def _sanitize_url(self, url, filter_exponea=False):
        if len(url) > 500:
            print("Something's probably wrong (loop?): " + url)  # or base64
            return None

        if url.startswith("http") and (
                not filter_exponea
                or self.__is_exponea(url)):  # refactor this bit to be generic
            return url

        if url.startswith("//") and (not filter_exponea
                                     or self.__is_exponea("http:" + url)):
            return "http:" + url

        if url.startswith("/"):
            return self.__url + url

        # TODO: some base64-encoded images are currently rejected

        print("Sanitize: Rejected " + url)
        return None

    @asyncio.coroutine
    def run(self):
        print("Processing " + self.__url)
        Spider.visited.append(self.__url)
        # get images

        page = yield from self.__downloader.download_data_url(self.__url)
        if not page:
            return
        imglinks = yield from self.__scraper.get_image_links(page)
        for link in imglinks:
            sanitized = yield from self._sanitize_url(link)
            if sanitized and (sanitized not in Spider.downloaded):
                Spider.downloaded.append(sanitized)
                yield from self.__downloader.download_image_url(
                    sanitized, self.__gen_file_name(sanitized))

        # SPAWN!
        links = yield from self.__scraper.get_links(page)
        for link in links:
            sanitized = yield from self._sanitize_url(link,
                                                      filter_exponea=True)
            if sanitized and (sanitized not in Spider.visited):
                next = Spider(self.__loop, sanitized)
                yield from next.run()
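A hedged sketch of how the coroutine-based Spider above can be driven from an asyncio event loop; the entry URL is only an illustration, and the generator-style @asyncio.coroutine code targets Python versions before 3.11:

import asyncio

def crawl(start_url):
    # The Spider keeps the loop only as a constructor argument in the snippet above.
    loop = asyncio.get_event_loop()
    spider = Spider(loop, start_url)
    loop.run_until_complete(spider.run())

crawl('https://exponea.com')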
Example #51
 def __init__(self, loop, url):
     self.__url = self.__remove_trailing_slash(url)
     self.__loop = loop
     self.__downloader = Download()
     self.__scraper = Scraper()
Example #52
 def __init__(self):
     Download.__init__(self)
     self.re_id = re.compile('\:content\:atom\.com\:(.+?)\"')
     self.urltype = VidType.ATOM
     self.h = {}
Example #53
def main():
    parser = argparse.ArgumentParser(description='Find all protein database entries of specified taxon IDs and their descendants.' \
          ' One taxID or a taxID input file must be provided. Peptide databases from NCBI or Uniprot can be used, as well as user-defined databases' \
          ' whose headers contain taxon IDs (e.g. OX=1111) or ncbi/uniprot accession IDs.')
    parser.add_argument(
        '-i',
        '--input',
        dest='input',
        default=None,
        help='TaxID input file: tabular file containing a column of NCBI'
        ' taxon IDs. Columns tab separated.')
    parser.add_argument('-c',
                        '--column',
                        dest='column',
                        type=positive_integer,
                        default=0,
                        help='The column (zero-based) in the tabular '
                        'file that contains Taxon IDs. Default = 0.')
    parser.add_argument(
        '-t',
        '--taxon',
        dest='taxon',
        type=positive_integer,
        nargs='+',
        action='append',
        help=
        'NCBI taxon ID(s) for database extraction. Multiple taxon IDs separated by spaces.'
    )
    parser.add_argument(
        '-d',
        '--database',
        dest='database',
        choices=['ncbi', 'uniprot', 'swissprot', 'trembl'],
        default='uniprot',
        help=
        'Database choice for analysis or for download. Choices: ncbi, uniprot, trembl, swissprot. '
        'No download, if databases with original name are stored in same folder as option --path '
    )
    parser.add_argument(
        '-p',
        '--path',
        dest='path',
        default=None,
        help='Path to folder with all needed '
        'databases: taxdump.tar.gz (for all databases), prot.accession2taxid or prot.accession2taxid.gz and '
        'pdb.accession2taxid.gz (for ncbi databases). Optional: peptide_database named: nr/nr.gz, '
        'uniprot_trembl.fasta/uniprot_trembl.fasta.gz or uniprot_sprot.fasta/uniprot_sprot.fasta.gz'
        ' or uniprot.fasta./uniprot.fasta.gz')
    parser.add_argument(
        '-o',
        '--out',
        dest='out',
        default=None,
        help=
        "File name and direction of the result taxon specified peptide database. "
        "Default = /taxon_specified_db_DATE/taxon_specific_database.fasta")
    parser.add_argument(
        '-n',
        '--dbname',
        dest='dbname',
        default=None,
        help=
        "Database name and direction. If database is in other folder than --path or name deviates from standard names."
    )
    parser.add_argument(
        '-l',
        '--level',
        dest='level',
        choices=[
            'species', 'section', 'genus', 'tribe', 'subfamily', 'family',
            'superfamily', 'order', 'superorder', 'class', 'phylum', 'kingdom',
            'superkingdom'
        ],
        default=None,
        help=
        'Hierarchy level up in ancestral tree. Choices: species, section, genus, tribe, '
        'subfamily, family, superfamily, order, superorder, class, phylum, kingdom, superkingdom'
    )
    parser.add_argument(
        '-z',
        '--no_descendants',
        dest='no_descendants',
        action='store_true',
        default=False,
        help=
        'Select peptide database only by given taxon IDs, descendant taxons are excluded.'
    )
    parser.add_argument(
        '-s',
        '--species',
        dest='species',
        action='store_true',
        default=False,
        help=
        'Select peptide database only up to taxonomic level "species"; descendants of species are excluded.'
    )
    parser.add_argument(
        '-r',
        '--non_redundant',
        dest='non_redundant',
        action='store_true',
        default=False,
        help=
        'Makes the final database non-redundant with regard to sequences; headers are concatenated.'
    )
    parser.add_argument(
        '-u',
        '--threads',
        dest='threads',
        type=positive_integer,
        action="store",
        help=
        'Number of threads for using multiprocessing. Default = number of cores.'
    )
    parser.add_argument(
        '-x',
        '--reduce_header',
        dest='reduce_header',
        action='store_true',
        default=False,
        help=
        'Reduce the long headers of NCBI entries to accession IDs. Use only for NCBI databases.'
    )
    parser.add_argument('--version',
                        action='version',
                        version=('version ' + __version__))
    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbose',
        action='store_true',
        default=False,
        help=
        'Verbose shows details about program progress and more information.')

    options = parser.parse_args()
    # URL addresses for download:
    url_protaccession2taxID = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz'
    url_protaccession2taxID_md5 = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5'
    url_pdbaccession2taxID = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz'
    url_pdbaccession2taxID_md5 = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz.md5'
    url_taxdump = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
    url_taxdump_md5 = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5'
    url_database_ncbi = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz'
    url_database_md5_ncbi = 'ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5'
    url_database_swissprot = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz'
    url_database_trembl = 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz'
    url_uniprot_metadata = 'ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink'
    db_dict_name = {
        'ncbi': url_database_ncbi.split('/')[-1],
        'uniprot': 'uniprot.fasta.gz',
        'swissprot': url_database_swissprot.split('/')[-1],
        'trembl': url_database_trembl.split('/')[-1]
    }

    # if not option out, a new folder with name taxon_database and date for result database and log file is created
    if options.out:
        output_path = Path.cwd() / options.out
    else:
        output_path = Output.createDir(Path.cwd())

    logger = initialize_logger(output_path, options.verbose)

    for arg, value in sorted(vars(options).items()):
        logger.debug("Argument %s: %r", arg, value)
    logger.debug("Result database and log file are saved in direction %s" %
                 output_path)

    # set path_to_db and database_folder for all user input variants

    # if options.path specified: folder to all databases (can be without protein DB if options.dbname)
    # if not exist, create folder with user defined name in option --path
    skip_check = False
    if options.path:
        database_folder = Path.cwd() / options.path
        path_to_db = database_folder / db_dict_name[options.database]

    # try open config file and read path to database folder, if no path option is entered
    # no config file, new database folder created
    else:
        try:
            path_to_main = Path(__file__, '..').resolve()
            with open(str(path_to_main) + "/tax2proteome.config",
                      'r') as config:
                database_folder = Path(config.readline().strip())
                path_to_db = database_folder / db_dict_name[options.database]
        except FileNotFoundError:
            database_folder = Path.cwd() / ('databases_' + str(date.today()))
            path_to_db = database_folder / db_dict_name[options.database]
            try:
                database_folder.mkdir()
                prot_gz_b = prot_b = pdb_b = taxdump_b = db_gz_b = db_b = False
                skip_check = True
                logger.info("Downloaded databases are saved in direction %s" %
                            database_folder)
            except FileExistsError:
                logger.debug(
                    "Database folder %s already exists. Checking for content."
                    % database_folder)
            except OSError:
                logger.exception(
                    "No permission to create new database folder.",
                    exc_info=True)
                exit(1)
    if not database_folder.exists():
        try:
            database_folder.mkdir()
            logger.info(
                "New folder %s created. All needed database files will be downloaded and stored in this "
                "direction." % database_folder)
            prot_gz_b = prot_b = pdb_b = taxdump_b = db_gz_b = db_b = False
            skip_check = True
        except OSError:
            logger.exception(
                "Database folder %s does not exist and can not be created." %
                database_folder,
                exc_info=True)
            exit(1)
    # user given path to database
    # given path to database checked, if not exists quit. Check if DB is in uniprot or ncbi format
    if options.dbname:
        path_to_db = Path.cwd() / options.dbname
        db_b = Output.check_files_exist([path_to_db])[0]
        if not db_b:
            logger.error(
                "Given database %s does not exist. Enter correct path under option --dbname. Program quits."
                % path_to_db)
            exit(1)
        if not TestFile.test_uniprot(options.dbname):
            options.database = 'ncbi'

    # check database folder for content
    # check if all needed files in database folder: bool values _b: True = file exists and not downloaded again
    if not skip_check:
        taxdump_b, prot_gz_b, prot_b, pdb_b, db_gz_b, db_b = Output.check_files_exist(
            [
                database_folder / url_taxdump.split('/')[-1],
                database_folder / url_protaccession2taxID.split('/')[-1],
                database_folder / 'prot.accession2taxid',
                database_folder / url_pdbaccession2taxID.split('/')[-1],
                path_to_db, path_to_db.parents[0] / path_to_db.stem
            ])
        if db_b:
            path_to_db = path_to_db.parents[0] / path_to_db.stem
        if not taxdump_b:
            logger.warning(
                "File taxdump.tar.gz does not exist under the path %s and will be downloaded."
                % str(database_folder))
        if not pdb_b and options.database == 'ncbi':
            logger.warning(
                "File pdb.accession2taxid.gz does not exist under the path %s and will be"
                " downloaded." % str(database_folder))
        if not prot_gz_b and not prot_b and options.database == 'ncbi':
            logger.warning(
                "File prot.accession2taxid.gz does not exist under the path %s and will be"
                " downloaded." % str(database_folder))
        if options.dbname is None and not db_b and not db_gz_b:
            logger.warning(
                "Database file %s does not exist under the path %s and will be downloaded."
                % (db_dict_name[options.database], str(database_folder)))

    # download taxdump file (best at the same day)
    if not taxdump_b:
        taxdump_md5 = read_ncbi_hash(url_taxdump_md5, logger)
        dwl_taxdb = Download(url_taxdump,
                             database_folder / url_taxdump.split('/')[-1],
                             taxdump_md5)
        dwl_taxdb.download()
        logger.debug('End download of taxdump.tar.gz')
    # download prot.accession2taxid.gz (only for ncbi) and check md5 hash
    if not prot_gz_b and not prot_b and options.database == 'ncbi':
        md5_hash = read_ncbi_hash(url_protaccession2taxID_md5, logger)
        dwl_protaccession = Download(url_protaccession2taxID,
                                     database_folder /
                                     url_protaccession2taxID.split('/')[-1],
                                     md5=md5_hash)
        dwl_protaccession.download()
        logger.debug(
            'End download from %s to location %s.' %
            (url_protaccession2taxID,
             str(database_folder / url_protaccession2taxID.split('/')[-1])))
    # download pdb.accession2taxid.gz (only for ncbi) and check md5 hash
    if not pdb_b and options.database == 'ncbi':
        md5_hash = read_ncbi_hash(url_pdbaccession2taxID_md5, logger)
        dwl_pdbaccession = Download(url_pdbaccession2taxID,
                                    database_folder /
                                    url_pdbaccession2taxID.split('/')[-1],
                                    md5=md5_hash)
        dwl_pdbaccession.download()
        logger.debug(
            'End download from %s to location %s.' %
            (url_pdbaccession2taxID,
             str(database_folder / url_pdbaccession2taxID.split('/')[-1])))
    # download peptide database and check md5 hash
    if not db_b and not db_gz_b:
        if options.database == 'ncbi':
            database_version_ncbi = 'ncbi ' + str(date.today())
            md5_hash = read_ncbi_hash(url_database_md5_ncbi, logger)
            dwl_db = Download(url_database_ncbi,
                              database_folder / db_dict_name['ncbi'],
                              md5=md5_hash)
            dwl_db.download()
            logger.debug("Databaseversion: %s" % database_version_ncbi)
            path_to_db = database_folder / db_dict_name['ncbi']
        else:
            if options.database == 'swissprot' or options.database == 'uniprot':
                database_version_swissprot, hash_swissprot = read_uniprot_metadata(
                    url_uniprot_metadata, db_dict_name['swissprot'], logger)
                logger.debug("Database version swissprot: %s " %
                             database_version_swissprot)
                dwl_db_swiss = Download(url_database_swissprot,
                                        database_folder /
                                        db_dict_name['swissprot'],
                                        md5=hash_swissprot)
                dwl_db_swiss.download()
                path_to_db = database_folder / db_dict_name['swissprot']
            if options.database == 'trembl' or options.database == 'uniprot':
                database_version_trembl, hash_trembl = read_uniprot_metadata(
                    url_uniprot_metadata, db_dict_name['trembl'], logger)
                logger.debug("Databaseversion trembl: %s." %
                             database_version_trembl)
                dwl_db_trembl = Download(url_database_trembl,
                                         database_folder /
                                         db_dict_name['trembl'],
                                         md5=hash_trembl)
                dwl_db_trembl.download()
                path_to_db = database_folder / db_dict_name['trembl']
            # concatenate swissprot and trembl into the uniprot file
            if options.database == 'uniprot':
                try:
                    logger.debug(
                        "Concatenate swissprot and trembl to uniprot database with name uniprot.fasta"
                    )
                    with open(str(database_folder / db_dict_name['trembl']),
                              'ab') as trembl:
                        with open(
                                str(database_folder /
                                    db_dict_name['swissprot']),
                                'rb') as swissprot:
                            shutil.copyfileobj(swissprot, trembl)
                    # rename trembl to uniprot:
                    Path(database_folder / db_dict_name['trembl']).rename(
                        database_folder / db_dict_name['uniprot'])
                    logger.debug("Uniprot database is now ready.")
                    path_to_db = database_folder / db_dict_name['uniprot']
                except FileNotFoundError:
                    logger.exception(
                        "Creation of uniprot database file out of trembl and swissprot file failed.",
                        exc_info=True)
                    exit(1)

    # create config file
    try:
        path_to_main = Path(__file__, '..').resolve()
        with open(str(path_to_main / "tax2proteome.config"), 'w') as config:
            config.write(str(database_folder) + '\n')
    except OSError:
        logger.debug('Can not create config file')

    # Read taxIDs from option -t and option -i
    if options.taxon:
        taxIDs = set(
            [taxID for taxonlist in options.taxon for taxID in taxonlist])
    else:
        taxIDs = set()
    if options.input:
        try:
            with open(options.input, 'r') as inputFile:
                for i, line in enumerate(inputFile):
                    fields = line.rstrip('\r\n').split('\t')
                    if len(fields) >= abs(options.column):
                        taxID = fields[options.column].strip()
                        if taxID.isdigit():
                            taxIDs.add(int(taxID))
                        else:
                            logger.error(
                                'Value %s in line %i of taxon input file is not a number. '
                                'Right column number specified?' % (taxID, i))
                            continue
                    else:
                        logger.error(
                            'Column number is bigger as number of columns in taxon ID input file. '
                            'Program continues without taxon IDs from input file.'
                        )
        except FileNotFoundError:
            logger.exception(
                'Taxon ID input file does not exist under specified path.',
                exc_info=True)

    if not taxIDs:
        logger.error(
            'No taxon ID given. Please check your input. Program quits. ')
        raise Exception('No taxon IDs.')
        exit(1)

    logger.debug('Given Tax-IDs: %s' % ' '.join(str(it) for it in taxIDs))

    # Try to load a pre-built taxonomy graph or build the taxonomy graph now
    if not (database_folder / 'taxon_graph').is_file():
        taxon_graph = TaxonGraph()
        logger.debug("Start building taxon graph.")
        taxon_graph.create_graph(database_folder / url_taxdump.split('/')[-1])
        logger.debug("Taxon graph successfully built.")
        # save TaxonGraph to harddrive:
        with open(str(database_folder / 'taxon_graph'), 'wb') as handle:
            pickle.dump(taxon_graph, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.debug('Save taxon graph to location: %s' %
                         str(database_folder / 'taxon_graph'))
    # load Taxon Graph
    else:
        try:
            logger.debug('Load taxon graph.')
            with open(str(database_folder / 'taxon_graph'), 'rb') as handle:
                taxon_graph = pickle.load(handle)
        except (UnicodeDecodeError, EOFError):
            logger.exception(
                "Failed opening path to taxon graph / taxon_graph is corrupted. Delete %s file."
                % str(database_folder / 'taxon_graph'))
            exit(1)

    # adjusts the hierarchy level, if level does not exist, take next smaller level
    if options.level:
        logger.debug(
            "Start selection of next ancestor of level %s for all given taxIDs"
            % options.level)
        taxIDs = {
            taxon_graph.find_level_up(taxID, options.level)
            for taxID in taxIDs
        }
        logger.info(
            "All taxon IDs are set up to level %s in ancestral tree. Taxon IDs of level %s: %s"
            %
            (options.level, options.level, ' '.join(str(it) for it in taxIDs)))

    final_taxIDs = set()
    # find all descendants
    if not options.no_descendants:
        logger.debug("Start searching for all child taxon IDs.")
        for taxID in taxIDs:
            final_taxIDs.update(taxon_graph.find_taxIDs(
                taxID, options.species))
        logger.debug("End searching for all child taxon IDs.")
        logger.debug('Number of final taxon IDs: %s' % str(len(final_taxIDs)))
    else:
        final_taxIDs = taxIDs
        logger.debug('Number of taxon IDs for database search: %s' %
                     str(len(final_taxIDs)))

    # generate accession_taxID dict for ncbi db search and write custom specified db to --out
    with_taxon_ID = TestFile.test_uniprot(path_to_db)
    if not with_taxon_ID:
        accession = Accession(final_taxIDs)
        logger.debug('Read accession files.')
        if prot_b:
            accession.read_accessions(
                database_folder / 'prot.accession2taxid',
                database_folder / url_pdbaccession2taxID.split('/')[-1],
                options.threads)
        else:
            accession.read_accessions(
                database_folder / url_protaccession2taxID.split('/')[-1],
                database_folder / url_pdbaccession2taxID.split('/')[-1],
                options.threads)
        logger.debug('All accession IDs collected.')
        logger.info('Start writing taxon selected peptide database to %s.' %
                    output_path)
        wc = WriteCustomDB(path_to_db, output_path)
        wc.read_database(False,
                         gzipped=TestFile.test_gzipped(path_to_db),
                         accessions=accession.accessionIDs,
                         threads=options.threads)
        logger.debug('End writing taxon selected peptide database.')
        # non redundant database

    # uniprot: write custom specified db to --out
    else:
        logger.info('Start writing taxon selected peptide database to %s.' %
                    output_path)
        wc = WriteCustomDB(path_to_db, output_path, final_taxIDs)
        wc.read_database(True,
                         threads=options.threads,
                         gzipped=TestFile.test_gzipped(path_to_db))
        logger.debug('End writing taxon selected peptide database.')

    # non redundant database
    if options.non_redundant:
        DatabaseCleaner.non_redundant(output_path, with_taxon_ID)
        # remove redundant database:
        output_path.unlink()

    if options.reduce_header and not with_taxon_ID:
        # reduce headers of NCBI database
        DatabaseCleaner.reduce_header(output_path)
        output_path.unlink()

    logger.info('Program finished.')
    exit(0)
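A hedged usage sketch for the parser defined above, invoked from Python. The script name and the human taxon ID 9606 are illustrative assumptions; the flags match the argparse definitions in main():

import sys

# Equivalent to: python tax2proteome.py -t 9606 -d swissprot -l species -v
sys.argv = ['tax2proteome.py', '-t', '9606',
            '-d', 'swissprot', '-l', 'species', '-v']
main()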
Example #54
 def doRequest(self, url):
   d = Download(url)
   if d.doRequest() == None:
     return None
   else:
     return d.getSOURCE()
Example #55
 def __init__(self):
     Download.__init__(self)
     self.h = {}
Example #56
	def __init__(self, phenny):
		Download.__init__(self)
		self.urltype = None	# 97 - TWITCHTV
		self.type = None
		self.consumerkey = phenny.config.justintvkey
		self.h = {}