def fetch_a_page():
    url = pagefetchqueue.get()
    if url:
        page = Page()
        page.url = url
        fetcher.fetch(page)
    return None
def fetch_a_document():
    url = documentfetchqueue.get()
    if url:
        page = Page()
        page.url = url
        fetcher.fetch(page)
    return None
def get_page(self, arch, source_version, target_version, path):
    if path == '' or path == '/':
        return self.ChangeSet(arch, source_version, target_version)
    out = ''
    diffable = is_diffable(path)
    if not diffable:
        return 'Sorry, this file is not diffable'
    arches = self.request.get('arches')
    if arches:
        oarch, narch = (int(_) for _ in arches.split('|')[:2])
    else:
        oarch = version_arch(source_version)
        narch = version_arch(target_version)
    odata = fetcher.fetch(oarch, source_version, path)
    ndata = fetcher.fetch(narch, target_version, path)
    if odata is None:
        out += ('Sorry, could not fetch file %s for version %s.<br>' %
                (path, source_version))
    elif ndata is None:
        out += ('Sorry, could not fetch file %s for version %s.<br>' %
                (path, target_version))
    else:
        odata = odata.read()
        ndata = ndata.read()
        try:
            odata = odata.decode('utf8')
        except:
            odata = odata.decode('cp1251')
        try:
            ndata = ndata.decode('utf8')
        except:
            ndata = ndata.decode('cp1251')
        #return htmldiff.make_table(odata.splitlines(),ndata.splitlines())
        out += '<div class="unified_diff">'
        out += '<pre>'
        diff = unified_diff(odata.splitlines(), ndata.splitlines(),
                            fromfile=source_version, tofile=target_version)
        prevdiv = ' '
        divs = {' ': '', '+': 'add', '-': 'del', '@': 'linenum'}
        curdiv = ''
        for l in diff:
            line = escape(l)
            curdiv = line[0]
            if curdiv != prevdiv:
                if prevdiv != ' ':
                    out += '</div>'
                if curdiv != ' ':
                    out += '<div class="%s">' % divs[curdiv]
            out += line
            out += "\n"
            prevdiv = curdiv
        if curdiv != ' ':
            out += '</div>'
        out += "</div>"
        out += "\n</pre>"
    return out
def import_manifest(arch, version):
    v = Version.get_by_key_name(version)
    if v is None or not v.imported:
        m = fetcher.fetch(arch, version, 'manifest.xml')
        if m is not None:
            m = Manifest(m)
            #xg_on = db.create_transaction_options(xg=True)
            v = version_ok(arch, version)
            prev = db.GqlQuery('select * from Version where imported = True and arch = {0}'.format(arch)).fetch(1)
            if prev is not None and len(prev) > 0:
                prev = prev[0]
                from htmldiff import Changeset
                pmanifest = Manifest(fetcher.fetch(arch, prev.value, 'manifest.xml'))
                changes = Changeset(pmanifest, m)
                to_delete = [pmanifest.files[x] for x in changes.dels | changes.changes
                             if pmanifest.files[x]['path'].endswith('entity')
                             and pmanifest.files[x]['path'].startswith('game/resources0.s2z')]
                to_import = [m.files[x] for x in changes.adds | changes.changes
                             if m.files[x]['path'].endswith('entity')
                             and m.files[x]['path'].startswith('game/resources0.s2z')]
                total = len(to_delete)
                current = 1
                del changes
                del m
                del pmanifest
                for file in to_delete:
                    e = Node.get_by_key_name('|'.join([file['version'], file['path']]))
                    if e is not None:
                        logging.info('[{1}/{2}] Deleting {0} entity group'.format(
                            '|'.join([file['version'], file['path']]), current, total))
                        db.run_in_transaction(delete_group, e)
                    current += 1
                del to_delete
            else:
                prev = None
                to_import = [x for x in m.files.values()
                             if x['path'].endswith('entity')
                             and x['path'].startswith('game/resources0.s2z')]
            total = len(to_import)
            current = 1
            for file in to_import:
                if file['path'].endswith('.entity'):
                    e = Node.get_by_key_name('|'.join([file['version'], file['path']]))
                    if e is None:
                        data = fetcher.fetch(arch, file['version'], file['path'])
                        #if data is None:
                        #    continue
                        logging.info('[%d/%d] importing %s %s into db' % (current, total, file['version'], file['path']))
                        db.run_in_transaction(parse_entity, data, file['version'], file['path'], [version])
                        #db.run_in_transaction_options(xg_on,parse_entity,file['version'],file['path'],[version])
                    #elif version not in e.versions:
                    #    db.run_in_transaction(set_version,e,version)
                current += 1
            v.imported = True
            v.put()
            if prev is not None:
                prev.imported = False
                prev.put()
def ChangeSet(self, arch, source_version, target_version):
    out = ''
    arches = self.request.get('arches')
    if arches:
        oarch, narch = (int(_) for _ in arches.split('|')[:2])
    else:
        oarch = version_arch(source_version)
        narch = version_arch(target_version)
        if oarch is None:
            oarch = arch
        if narch is None:
            narch = arch
    omanifest = fetcher.fetch(oarch, source_version, 'manifest.xml')
    nmanifest = fetcher.fetch(narch, target_version, 'manifest.xml')
    if omanifest is not None:
        #Version(value=source_version,key_name=source_version).put()
        version_ok(oarch, source_version)
    if nmanifest is not None:
        #Version(value=target_version,key_name=target_version).put()
        version_ok(narch, target_version)
    if omanifest is None:
        out += 'Sorry, could not fetch manifest for %s' % source_version
    elif nmanifest is None:
        out += 'Sorry, could not fetch manifest for %s' % target_version
    else:
        omanifest = Manifest(omanifest)
        nmanifest = Manifest(nmanifest)
        changeset = Changeset(omanifest, nmanifest)
        changes = ({'path': f,
                    'old_version': omanifest.files[f]['version'],
                    'new_version': nmanifest.files[f]['version']}
                   for f in changeset.changes)
        adds = ({'path': f,
                 'new_version': nmanifest.files[f]['version']}
                for f in changeset.adds)
        dels = ({'path': f,
                 'old_version': omanifest.files[f]['version']}
                for f in changeset.dels)
        template_values = {
            'source_version': source_version,
            'target_version': target_version,
            'changes': changes,
            'adds': adds,
            'dels': dels,
            'base_url': fetcher.get_base_url(arch),
            'base_url2': fetcher.get_base_url(arch, 1),
            'arches': '|'.join([str(_) for _ in [oarch, narch]]),
        }
        template = templates.get_template('changeset.html')
        return template.render(template_values)
    self.response.out.write(out)
    return None
def test_fetcher_sets_url(mock_requests):
    """Test that the fetcher correctly sets the podcast's url."""
    mock_requests.get(FEED_URL, text=read_test_file("canvas"))
    result = fetch(FEED_URL)
    assert result.url == FEED_URL
def get(artist, song, album=None):
    """Fetch the lyrics as text."""
    info = artist, song, album or ''
    try:
        return database.load(*info)
    except LookupError:
        return fetcher.fetch(*info)
def extract(entrypoint):
    """Extract feed from entrypoint.

    Entrypoint can be a full url or a domain name. When a domain name is
    given, it attempts to guess the homepage where it can extract the feed.
    """
    fetched_data = fetch(entrypoint)
    if fetched_data is None:
        return None
    else:
        urls, html = fetched_data
        feeds = extract_feeds(html, urls[-1])
        expanded_feeds = expand_feeds(feeds)
        results = filter_expanded_feeds(urls, expanded_feeds)
        # TODO loop toward domain for cases such as http://www.lesechos.fr/economie-politique/france/
        # TODO Scoring based on sublinks? -> http://www.lefigaro.fr
        if results:
            # Temporary behavior on multiple results: select the first one
            # TODO Implement better scoring
            del results[0]['sublinks']
            results[0]['link'] = results[0]['link'][0]
            return results[0]
        return None
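# A minimal usage sketch for extract() above, assuming the module's fetch()/extract_feeds()
# helpers are importable as shown; the entrypoint below is illustrative, not from the original.
if __name__ == '__main__':
    feed = extract('lemonde.fr')
    if feed is None:
        print('No feed could be extracted')
    else:
        print(feed['link'])  # extract() collapses 'link' to a single URL before returning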
def test_fetcher_returns_parsed_feed(mock_requests):
    """Test that the fetcher correctly fetches and parses a podcast feed."""
    mock_requests.get(FEED_URL, text=read_test_file("canvas"))
    result = fetch(FEED_URL)
    assert result.title == "Canvas"
def main():
    fetcher.fetch()
    filenames = os.listdir(os.path.join(".", DIRECTORY))
    f2l_dict = dict()
    for filename in filenames:
        if filename.split(os.extsep)[-1] in ["htm", "html"]:  # check the file extension
            print("Parsing file: " + filename)
            f = open(os.path.join(".", DIRECTORY, filename), "r", encoding="utf-8")
            parse_page(f.read(), f2l_dict)
            f.close()
        else:
            print("Wrong file type: " + filename)
    out = open("output.txt", "w", encoding="utf-8")
    out.write(str(f2l_dict))
    out.close()
    print("Processing complete!")
def test_fetcher_sets_last_fetched(mock_requests):
    """Test that the fetcher correctly updates the podcast's last_fetched property."""
    mock_requests.get(FEED_URL, text=read_test_file("canvas"))
    now = datetime.datetime.now()
    with freezegun.freeze_time(now):
        result = fetch(FEED_URL)
    assert result.last_fetched == now
def handle(job, *args, **kwargs):
    queue = kwargs['queue']
    task = json.loads(job)
    url = task["url"]
    status, source = fetcher.fetch(url, use_proxy=False)
    logger.info('%s|%s' % (url, status))
    try:
        _, source = encoding.html_to_unicode('', source)
    except Exception, e:
        print e
def get_stringtables_entities(arch, version):
    try:
        stringtable = memcache.get('stringtable|entities|{0}'.format(version))
    except:
        flush_all()
        stringtable = None
    if stringtable is not None:
        return stringtable
    stringtable = {}
    stringtable_version = Manifest(fetcher.fetch(arch, version, 'manifest.xml')).files['game/resources0.s2z/stringtables/entities_en.str']['version']
    tabledata = fetcher.fetch(arch, stringtable_version, 'game/resources0.s2z/stringtables/entities_en.str').read().decode('utf8')
    for line in tabledata.splitlines():
        m = re_entry.match(line)
        if m:
            stringtable[m.group(1)] = m.group(2).strip()
    try:
        memcache.set('stringtable|entities|{0}'.format(version), stringtable)
    except:
        flush_all()
    return stringtable
def set_user():
    connection = mdb.connect(host='localhost', user='******', passwd='',
                             db='fbhack', use_unicode=True, charset='utf8')
    cursor = connection.cursor()
    data = (
        int(request.form['facebook_id']),
        request.form['first_name'],
        request.form['last_name'],
        request.form['username'],
        request.form['phone']
    )
    cursor.execute(
        "INSERT INTO users (facebook_id, first_name, last_name, username, phone) "
        "VALUES(%s, %s, %s, %s, %s);",
        data)
    connection.commit()
    fetch()
    return jsonify(result="OK")
def run(self):
    item = self.queue.get()
    while item is not None:
        try:
            url = item['url']
            key = item['key']
            constraint = item['constraint']
            data = fetch(url)
            if data is None:
                self.logger.info('Not fetched: %s because type != text/html', url)
            else:
                links = get_all_links(data, base=url)
                feeds = get_all_feeds(data, base=url)
                interesting = collect(links)
                if interesting:
                    self.collection_mutex.acquire()
                    if key not in self.collection:
                        self.collection[key] = {'feeds': {}}
                    if feeds:
                        for feed in feeds:
                            self.collection[key]['feeds'][feed['href']] = feed['type']
                    for service, accounts in interesting.items():
                        if service not in self.collection[key]:
                            self.collection[key][service] = {}
                        for a, u in accounts.items():
                            self.collection[key][service][a] = {'url': u, 'depth': constraint.depth}
                    self.collection_mutex.release()
                for l in links:
                    new_constraint = constraint.inherit(url, l)
                    if new_constraint is None:
                        continue
                    self.mutex.acquire()
                    if l not in self.visited_urls:
                        self.queue.put({'url': l, 'key': key, 'constraint': new_constraint})
                        self.visited_urls.add(l)
                    self.mutex.release()
        except HTTPError:
            self.logger.info('HTTPError exception on url: %s', url)
        self.queue.task_done()
        item = self.queue.get()
    self.queue.task_done()  # task_done on None
def process_wave(wave):
    """
    Gets the median intensities for a wavelength, and the file paths.

    If no *good* data is found in the first 6 hours of the day at 15 minute
    steps, the value is replaced with NaN in the series. Good images are those
    that have a "quality" rating of 0. At the end, all NaNs are filled with the
    last known value until then; unknown values in the beginning are filled
    from the next known value.

    Args:
        wave (str): wave to process

    Returns:
        list containing the wave str, list of filenames, and intensities
    """
    paths = pd.Series(index=date_list)
    raw = pd.Series(index=date_list)
    for date in datetime_list:
        fles = fetch(date, date + timedelta(minutes=1), wave)
        missing_data = False
        while no_images(fles):
            date += timedelta(minutes=15)
            fles = fetch(date, date + timedelta(minutes=1), wave)
            if date.hour >= 6:
                missing_data = True
                break
        # print(date)
        if not missing_data:
            index = [str(date.date())]
            fle = fles[0]
            med_int = process_med_int(fle)
            paths.loc[index] = fle
            raw.loc[index] = med_int
    paths = paths.ffill()  # propagate missing values forwards
    paths = paths.bfill()  # backwards (if initial dates lack data)
    raw = raw.ffill()
    raw = raw.bfill()
    return [wave, paths, raw]
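# A minimal usage sketch for process_wave() above, assuming date_list/datetime_list and the
# fetch()/no_images()/process_med_int() helpers exist as in the surrounding module; the
# wavelength names below are illustrative assumptions, not taken from the original code.
if __name__ == '__main__':
    results = {}
    for wave in ['171', '193', '211']:
        name, paths, raw = process_wave(wave)  # returns [wave, file paths, median intensities]
        results[name] = {'paths': paths, 'median_intensity': raw}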
def _load_url(self, url_u, encoding=None):
    # word hit list obsolete
    self.wordhitview.clear_words()
    # set text in textview
    ret = fetcher.fetch(url_u)
    if not encoding:
        encoding = decoder.detect_encoding(ret.txt_byte)
    txt_u = decoder.decode(ret.txt_byte, encoding)
    txt_u = unmarkup.unwiki(txt_u) or unmarkup.unhtml(txt_u)
    self.text = word.Text()
    self.text.set_from_txt_u(txt_u)
    self.textview.set_text(self.text, encoding, url_u)
def get_page(self, arch, version):
    manifest = fetcher.fetch(arch, version, 'manifest.xml')
    if manifest is not None:
        version_ok(arch, version)
    else:
        self.response.out.write("Sorry, could not fetch manifest for version {0}".format(version))
        return
    manifest = Manifest(manifest)
    data = fetcher.fetch(arch, fetcher.normalize_ver(manifest.files['change_log_color.txt']['version']), 'change_log_color.txt')
    if data is None:
        return
    data = data.read()
    try:
        data = data.decode('utf8')
    except:
        data = data.decode('cp1251')
    data = data.replace('\r\n', '\n')
    data = hon2html(data)
    data = re.sub(ver_sub, r'\1[color=Yellow][SIZE=6][b]\2[/b][/size][/color]', data)
    data = re.sub(head_sub, r'\1[B]==[SIZE=4]\2[/size]==[/b]', data)
    data = re.sub(hr_sub, r'[hr][/hr]', data)
    smilies = fetch_honsmilies()
    data = re.sub(smilies[0],
                  lambda m: '%s%s [b]%s[/b]' % (m.group(1), smilies[1][m.group(2).lower()], m.group(2)),
                  data)
    return ''.join(['<pre>', data, '</pre>'])
def get_page(self, arch, version, path, fpath):
    manifest = fetcher.fetch(arch, version, 'manifest.xml')
    if manifest is not None:
        #Version(value=version,key_name=version).put()
        version_ok(arch, version)
    else:
        self.response.out.write("Sorry, could not fetch manifest for version {0}".format(version))
        return
    manifest = Manifest(manifest)
    path = path[1:]
    if fpath != '':
        print 'file requested!'
    else:
        if path == '':
            nodes = manifest.files.keys()
        else:
            l = len(path)
            nodes = [f[l:] for f in manifest.files.keys() if f.startswith(path)]
        dirs = []
        files = []
        for x in nodes:
            n = x.split('/')
            if len(n) == 1:
                x = path + x
                f = {
                    'path': n[0],
                    'version': fetcher.normalize_ver(manifest.files[x]['version']),
                    'size': manifest.files[x]['size'],
                }
                f['lang'] = get_lang(n[0])
                f['fullpath'] = x
                files.append(f)
            else:
                dirs.append(n[0])
        dirs = frozenset(dirs)
        if path != '':
            up_url = '..'
        else:
            up_url = ''
        template_values = {
            'version': version,
            'path': path,
            'dirs': sorted(list(dirs)),
            'files': files,
            'up_url': up_url,
            'base_url': fetcher.get_base_url(arch),
            'base_url2': fetcher.get_base_url(arch, 1),
        }
        template = templates.get_template('folder.html')
        return template.render(template_values)
def handler():
    if flask.request.method == "GET":
        return {"status": "ok"}, 200
    else:
        if flask.request.json is None:
            app.logger.error("POST req without req body received")
            return {"status": "error"}, 500
        else:
            try:
                ids = flask.request.json["ids"]
                tweets = fetcher.fetch(ids)
                app.logger.info("POST req with req body success")
                return {"status": "ok", "data": tweets}, 200
            except Exception:
                app.logger.error("POST req with req body failed")
                return {"status": "error"}, 500
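# A hedged client-side sketch for the Flask handler above, assuming the app is served locally
# on port 5000 and routed at '/' (both assumptions); the handler reads a JSON body with an
# "ids" list via flask.request.json["ids"].
import requests

resp = requests.post('http://localhost:5000/', json={'ids': ['20', '21']})
print(resp.status_code, resp.json())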
def url_handler(url_u, dir='/tmp/t'):
    if not os.path.isdir(dir):
        os.makedirs(dir)
    os.environ["ORIG_FILENAMES"] = "1"
    filename = os.path.join(dir, urlrewrite.url_to_filename(url_u)) + '.txt'
    ret = fetcher.fetch(url_u)
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unmarkup.unwiki(txt_u)
    # add license notice
    tm = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
    notice = u"\n\n%s\nRetrieved on %s from:\n %s" % ('-' * 78, tm, ret.url_u)
    notice += (u"\nLicensed under CC-BY-SA, see %s" %
               "http://creativecommons.org/licenses/by-sa/3.0/")
    txt_u += notice
    txt_byte = decoder.encode(txt_u)
    open(filename, 'w').write(txt_byte)
def get_page(self, arch, version, path):
    lang = self.request.query_string
    if lang == "":
        lang = "text"
    if not is_diffable(path):
        return "Sorry, viewing this type of file is not allowed"
    data = fetcher.fetch(arch, version, path)
    if data is None:
        self.response.out.write("Sorry, could not fetch file %s for version %s.<br>" % (path, version))
        return None
    else:
        data = data.read()
        try:
            data = data.decode("utf8")
        except:
            data = data.decode("cp1251")
        template_values = {"data": pygmentize(lang, data)}
        template = get_template("highlight.html")
        return template.render(template_values)
def _refresh(self, url_prefix, item, stamps_dict, available):
    try:
        known_item = item in stamps_dict
        stamp = None
        if known_item and os.access(self._path('current', item), os.R_OK):
            stamp = stamps_dict[item]
        resp = fetch(url_prefix + item, stamp)
        local_path = self._path('new', item)
        if resp.status >= 400:  # error statuses
            if known_item:
                del stamps_dict[item]
            return known_item  # changed if previously known
        elif resp.status == 304:  # not modified
            available.add(item)
            os.link(self._path('current', item), self._path('new', item))
            stamps_dict[item] = resp.date
            return False  # unchanged
        elif 200 <= resp.status < 300:  # downloading
            available.add(item)
            resp.save(local_path)
            stamps_dict[item] = resp.date
            return True  # changed
        else:
            raise Exception("Don't know what to do with response %s" % resp.status)
    except:
        import traceback
        logging.error('Failed to fetch %s%s. Skipping. Exception info:\n%s',
                      url_prefix, item, traceback.format_exc())
        return False  # assume unchanged
def fetch_action():
    questions = fetcher.fetch(from_time=load_config()["last-sync"])
    classified_questions = get_classifier().classify(questions)
    relation_store.add_list_of_questions(classified_questions)
def make_requests(requests):
    start = time.time()
    for ok, resp in fetch(requests, concurrent=100):
        print ok, resp
    delta = time.time() - start
    print '%.02f req/s' % (count / delta)
def test_save_start_time_to_config(config, current_time):
    with mock.patch("time.time", return_value=current_time):
        fetcher.fetch(sites=[], from_time=0)
    config.assert_called_with("last-sync", current_time)
def get_page(self, arch, version, hero):
    v = Version.get_by_key_name(version)
    if arch != fetcher.ARCHS.LINUX_RETAIL:
        return '<pre>Sorry, DB is disabled for RCT/SBT</pre>'
    elif v is None or not v.imported:
        versions = get_versions()
        versions.sort(key=lambda x: [int(y) for y in x.split('.')])
        if version == versions[-1]:
            self.response.out.write("Sorry, this version is not imported into db yet, importing was put into queue")
            taskqueue.add(url='/import', params={'version': version, 'arch': arch}, queue_name='importer')
        else:
            self.redirect('/heroes/latest/?' + self.request.query_string)
        return None
    else:
        if hero is None:
            manifest = fetcher.fetch(arch, version, 'manifest.xml')
            manifest = Manifest(manifest)
            query = "Select * from Node where tag='hero'".format(version)
            q = db.GqlQuery(query)
            result = q.fetch(1000)
            result = [_ for _ in result if _.name not in ['wl_Warlock']]
            for hero in result:
                if hasattr(hero, 'attackprojectile') and hero.attackprojectile != '':
                    projectile = db.GqlQuery("Select * from Node where name='{0}'".format(hero.attackprojectile)).fetch(1)[0]
                    if hasattr(projectile, 'speed'):
                        hero.projectilespeed = projectile.speed
                    else:
                        hero.projectilespeed = '""'
                else:
                    hero.projectilespeed = '""'
                # get url for icon
                icon = hero.icon.replace('.tga', '.dds')
                path = '/'.join(hero.key().name().split('|')[1].split('/')[:-1])
                path = '/'.join([path, icon])
                path = path.replace('game/resources0.s2z', 'game/textures.s2z/00000000')
                if path in manifest.files:
                    path = '/'.join([manifest.files[path]['version'], path])
                else:
                    logging.info("Failed to create url for hero icon :( :")
                    logging.info(icon)
                    logging.info(path)
                hero.iconurl = path
            template_values = {}
            template_values['data'] = result
            template_values['stringtables'] = get_stringtables_entities(arch, version)
            template_name = self.request.get('template')
            if template_name and template_name == 'csv':
                template = templates.get_template('heroes.csv')
            else:
                template = templates.get_template('heroes.html')
            #self.response.out.write(template.render(template_values))
            #return None
            return template.render(template_values)
        else:
            hero = db.GqlQuery("Select * from Node where tag='hero' and name = :1", hero).fetch(1)
            if len(hero) == 0:
                return 'Sorry, no such hero was found'
            hero = hero[0]
            # get url for icon
            manifest = fetcher.fetch(arch, version, 'manifest.xml')
            manifest = Manifest(manifest)
            icon = hero.icon.replace('.tga', '.dds')
            path = '/'.join(hero.key().name().split('|')[1].split('/')[:-1])
            path = '/'.join([path, icon])
            path = path.replace('game/resources0.s2z', 'game/textures.s2z/00000000')
            path = '/'.join([manifest.files[path]['version'], path])
            hero.iconurl = path
            abilities = db.GqlQuery("Select * from Node where tag='ability' and name in :1",
                                    [hero.inventory0, hero.inventory1, hero.inventory2, hero.inventory3]).fetch(10)
            for a in abilities:
                icon = a.icon.replace('.tga', '.dds')
                path = '/'.join(a.key().name().split('|')[1].split('/')[:-1])
                path = '/'.join([path, icon])
                path = path.replace('game/resources0.s2z', 'game/textures.s2z/00000000')
                path = '/'.join([manifest.files[path]['version'], path])
                a.iconurl = path
            #abilities = dict([(a.name,a) for a in abilities])
            template_values = {}
            template_values['entity'] = hero
            template_values['version'] = version
            template_values['abilities'] = abilities
            template_values['stringtables'] = get_stringtables_entities(arch, version)
            template = templates.get_template('hero.html')
            return template.render(template_values)
def core(args):
    """ """
    csv = os.path.expanduser(args.csv_file)
    fetcher.fetch(csv)
from fetcher import fetch
from generator import generate
from settings import CALENDAR_ID

generate(fetch(CALENDAR_ID))
def get_page(self, arch, version, path):
    dds = fetcher.fetch(arch, version, path)
    return dds2png(dds)
def test_happy_path():
    url = 'http://www.google.com'
    data = fetcher.fetch(url)
    assert data
    assert len(data) > 0
    assert 'google' in data.lower()
def test_invalid_feed(mock_requests):
    """Test that the fetcher raises an error if the url does not point to a valid rss feed."""
    mock_requests.get(FEED_URL, text="<html><body><h1>Not A Feed!</h1></body></html>")
    with pytest.raises(InvalidFeed):
        fetch(FEED_URL)
def main():
    # Part where argparser figures out your command
    parser = argparse.ArgumentParser(
        description='Parse NCBI and then work with Biological data')
    # Argument for clearing out the storage folder (Mostly for testing purposes)
    parser.add_argument('-d', '--delete', dest='delete', default=False, action='store_true',
                        help="delete current storage")
    # Argument for fetching.
    parser.add_argument('-f', '--fetch', dest='fetch', default="",
                        help='Fetches from ncbi and adds to storage: \n '
                             'Usage: -f [Accession number or boolean operators]')
    parser.add_argument('-i', '--index', dest='index', action='store_true',
                        help='Resets the indexes. This can be done manually through this method '
                             'or specified to do it every time from the configs.')
    parser.add_argument('-m', '--mafft', dest='mafft', default=False, action='store_true',
                        help="Runs mafft when pulling. Optional alignment but requires -p or --pull "
                             "to be effective. Can also be specified to run automatically in config")
    parser.add_argument('-p', '--pull', dest='pull', default=False, action='store_true',
                        help="Pull from storage. The genes and species specified are specified "
                             "in genes.lst and species.lst.")
    parser.add_argument('-s', '--setup', dest='setup_structure', default="",
                        help="Usage: -s [storage location]\n"
                             "Sets up a default structure for storage and indexes. "
                             "This should be done when moving storage to a location "
                             "outside of the cloned folder.")

    # This stores all of the values from the parser
    args = parser.parse_args()
    delete = args.delete
    query_fetch = args.fetch
    index = args.index
    mafft_args = args.mafft
    pull = args.pull
    setup_structure = args.setup_structure

    # Testing output
    output = "Output: \n"

    # This is the part where we are reading from the config
    config = configparser.ConfigParser()
    config.read('ncbifetcher.config')
    email = config['OPTIONS']['email']
    location_index = config['INDEX']['index_location']
    location_storage = config['STORAGE']['storage_location']
    location_output = config['OUTPUT']['output_location']
    reset_indexes_default = config['OPTIONS']['reset_indexes_everytime']
    run_mafft_config = config['OPTIONS']['run_mafft_everytime']

    # Testing: Deletes everything in the folders and resets the indexes
    if delete:
        print("deleting... \n")
        delete_folder_contents(location_storage)
        delete_folder_contents(location_output)
        # Optional resetting indexes
        if reset_indexes_default == 1 or reset_indexes_default:
            reset_indexes(location_storage, location_index)
        return

    # Fetches from genbank
    if len(query_fetch) >= 1:
        # If the input is a file, fetches all from the file
        if os.path.isfile(query_fetch):
            print("Fetching from file: ", query_fetch)
            accession_numbers_from_file = []
            lines = open(query_fetch, "r")
            for line in lines:
                # Gets every possible entry from file
                accession_numbers_from_file.append(line.strip().strip('\n'))
            accession_numbers_from_file = ','.join(accession_numbers_from_file)
            # Fetches based on the accession numbers
            fetch(accession_numbers_from_file, location_storage, email)
        else:
            # Fetches the single query
            print("Fetching...")
            fetch(query_fetch, location_storage, email)
        # Optional resetting indexes
        if reset_indexes_default == 1 or reset_indexes_default:
            reset_indexes(location_storage, location_index)
        return

    # This is a way to sort the indexes
    if index:
        print("Resetting indexes...")
        output += "Index: \n"
        reset_indexes(location_storage, location_index)
        return

    # Pulling from storage - Default set to wherever index says to go
    if pull:
        print("Pulling...")
        pull_query_to_fasta(location_output, location_index, location_storage,
                            run_mafft=mafft_args or run_mafft_config == 1 or run_mafft_config == "true")
        return

    # For setting up a file structure at a location other than default
    if len(setup_structure) >= 1:
        print("Setting up structure at " + setup_structure + "...")
        ensure_folder_scheme_storage(setup_structure)
        return
        'nid': nid,
        'pid': pid,
        'cover': cover,
        'playlistId': playlistId,
        'o_playlistId': o_playlistId,
        'cid': cid,
        'subcid': subcid,
        'osubcid': osubcid,
        'category': category,
        'cateCode': cateCode,
        'pianhua': pianhua,
        'tag': tag,
        'tvid': tvid,
        'title': title,
        'last': last,
        'brief': brief
    }
    return item


if __name__ == '__main__':
    import fetcher
    url = 'http://tv.sohu.com'
    url = 'http://tv.sohu.com/20131223/n392267093.shtml'
    url = 'http://tv.sohu.com/20131223/n392267093.shtml'
    status, content = fetcher.fetch(url)
    _, ucontent = encoding.html_to_unicode('', content)
    #print extract_links(url, ucontent)
    #print extract_content(url, ucontent)
    #print extract_sohutv(url, ucontent)
    print extract_sohutv_data_by_regex(url, ucontent)
print f, w.get().encode('utf-8'),
for h in w.get_hits():
    print h.get_pos(),
print

import sys
text = Text()
text.set_from_file(sys.argv[1])
text.do_index()
data = text.by_freq()
print_by_freq(data)

import fetcher
url_u = u'http://www.dagbladet.no'
text = Text()
ret = fetcher.fetch(url_u)
encoding = decoder.detect_encoding(ret.txt_byte)
text.set_from_txt_byte(ret.txt_byte, encoding, untag=True)
text.do_index()
data = text.by_freq()
print_by_freq(data)
print(encoding)
sys.exit()


def out(dct, f):
    ws = dct.keys()
    ws = sorted(ws, cmp=lambda x, y: cmp(x.lower(), y.lower()))
    s = ''
    for w in ws:
        s += '%-6.6s %s\n' % (dct[w].len_hits(), w)
    filter_mediawiki = mediawiki.MediawikiFilter()
    txt_u = filter_mediawiki.get_wiki_body(txt_u)
    return txt_u


def unwiki(txt_u):
    filter_mediawiki = mediawiki.MediawikiFilter()
    filter_html = html.HtmlFilter()
    txt_u = filter_mediawiki.get_wiki_body(txt_u)
    txt_u = filter_html.resolve_specialchars(txt_u)
    txt_u = filter_mediawiki.unmarkup(txt_u)
    txt_u = filter_html.unmarkup(txt_u)
    return txt_u


if __name__ == "__main__":
    import decoder
    import fetcher

    ret = fetcher.fetch('http://en.wikipedia.org/w/index.php?title=Linguistics&action=edit')
    txt_u = decoder.detect_decode(ret.txt_byte)
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()

    txt_byte = open(sys.argv[1]).read()
    txt_u = decoder.detect_decode(txt_byte)
    txt_u = unwiki(txt_u) or unhtml(txt_u)
    print(decoder.encode(txt_u))
    sys.exit()