def get_flair_entities(input, score_threshold=0.9):
    """Return the named entities found in ``input``, merging adjacent ones.

    The text is wrapped in a ``Sentence``, tagged by the module-level
    ``model``, and the ``'ner'`` entities are read back from
    ``sentence.to_dict``.  Entities whose first label scores below
    ``score_threshold`` are ignored.  An entity that starts exactly one
    character after the previously accepted entity ended is merged with it
    (joined by a single space).

    Args:
        input: Text to analyse.
        score_threshold: Minimum score of an entity's first label.

    Returns:
        A list of entity strings, in order of appearance.
    """
    sentence = Sentence(input, use_tokenizer=True)
    model.predict(sentence)

    # refactor flair output
    entities = []
    prev_end_pos = 0
    prev_entity_text = ''
    for entity in sentence.to_dict(tag_type='ner')['entities']:
        if entity['labels'][0]._score < score_threshold:
            continue
        logger.info('flair entity detected: ' + str(entity))
        # Only merge when there really is a previously accepted entity;
        # otherwise an entity starting at position 1 would try to delete
        # from an empty list.
        if entities and prev_end_pos + 1 == entity['start_pos']:
            entities.pop()
            final_entity = (prev_entity_text + ' ' + entity['text']).strip()
        else:
            final_entity = entity['text'].strip()
        entities.append(final_entity)
        prev_end_pos = entity['end_pos']
        # Remember only the entity just emitted, so a later merge extends
        # this entity rather than every entity seen so far.
        prev_entity_text = final_entity
    return entities
async def fetch_all(urls, loop):
    """Fetch every URL in ``urls`` concurrently through one client session.

    Each URL is handed to ``fetch``; failures are returned in place of the
    result (``return_exceptions=True``) instead of being raised.

    NOTE(review): ``proxy`` is not a parameter here; it is presumably a
    module-level name. Confirm.
    """
    async with aiohttp.ClientSession(loop=loop) as session:
        logger.info('Gathering async requests')
        pending = [fetch(session, url, proxy) for url in urls]
        return await asyncio.gather(*pending, return_exceptions=True)
def SvcDoRun(self):
    """Start the DDNS loader, then block until the service is stopped."""
    logger.info("[WIN SERVICE]start service ddns....")
    self._ddns_Loader.start()
    wait_for = win32event.WaitForSingleObject
    # Wait for the service to be stopped.
    while self.isAlive:
        wait_for(self.hWaitStop, win32event.INFINITE)
def get_google_search_result_count(person_name, exact_match, proxies,
                                   country_code):
    """
    Returns either bool False or int as number of search results found.
    False will trigger the decorator to run this exact function again
    with a hope to get the function to return an int.

    Major problem with it is that the results are chaotic and not
    reproducible because google returns different structures depending on
    the random proxy through which the request is being sent.

    False is returned when the search response itself is False (captcha),
    when the results div is missing or empty, or when no number can be
    found in the div text.
    """
    response = get_google_search_response(person_name, exact_match, proxies,
                                          country_code)
    # The response helper signals a captcha with a literal False.  Compare by
    # identity: a Response object may itself be falsy for non-2xx statuses.
    if response is False:
        return False

    # parse the results and extract the part of html
    # which contains the number of search results found
    soup = BeautifulSoup(response.text, 'html.parser')
    results_div = soup.find('div',
                            attrs={'id': ['resultStats', 'slim_appbar']})

    # TODO: what if the results are in fact 0? e.g. sosicc cequel tycoonkingz
    # have to find a way to separate this case from other cases where we
    # return False. It should look something like this:
    # if X (no results found) is in soup: return 0
    if not results_div or not results_div.text:
        logger.info(
            'Google search does not contain the search results div / or there are 0 results'
        )
        return False

    def _parse_number_of_results(results_div_text):
        """
        Parse total number of search results given the text inside the div
        that should hold the number. Finds numbers in the string and
        returns the first number found as int, or False if there is none.
        """
        # clean the string by removing space and commas
        # what we want is glued string from which we can
        # easily extract the first number found
        results_div_text = unidecode.unidecode(results_div_text)
        results_div_text = ''.join(
            [i for i in results_div_text if i.isalpha() or i.isnumeric()])

        # first number is the one we are looking for
        # the second number should be the time it took
        # for google to return the search results
        # e.g Apie 54 500 000 rezult. (0,67 sek.)
        regex = r"[\d\s]+(?:\.(?:\s*\d){2,4})?"
        m = re.search(regex, results_div_text)
        if m is None:
            # No digits at all: treat like any other unparsable response so
            # the caller's retry logic kicks in instead of crashing.
            logger.info('Google search results div contains no number')
            return False
        return int(m.group())

    return _parse_number_of_results(results_div.text)
def __init__(self, name, is_train, norm='instance',
             activation=tf.nn.leaky_relu):
    """Log the discriminator's creation and store its configuration.

    ``reuse`` always starts out as False.
    """
    logger.info('Init Discriminator %s', name)
    self.name, self.is_train = name, is_train
    self.norm, self.activation = norm, activation
    self.reuse = False
def SvcStop(self):
    """Handle a stop request: report it, stop waiting, stop the loader."""
    logger.info("[WIN SERVICE]stop service ddns....")
    # First tell the SCM that the stop is in progress.
    self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING)
    # Clear the run flag BEFORE signalling the event.  If the event were set
    # first, a loop of the form ``while self.isAlive: wait(hWaitStop)`` could
    # wake up, still observe ``isAlive`` as True and go back to an infinite
    # wait on an event that will never be signalled again.
    self.isAlive = False
    # Set the event.
    win32event.SetEvent(self.hWaitStop)
    self._ddns_Loader.stop()
def new(self, **kw):
    """Log the most recent error in the retry history, then delegate.

    ``kw`` is forwarded untouched to the parent class's ``new``
    (presumably urllib3's ``Retry.new``; confirm).  A missing or empty
    ``history`` is tolerated: nothing is logged in that case.
    """
    # kw = {'total': int, ..., 'history': tuple}
    history = kw.get('history') or ()
    last_error = history[-1].error if history else None
    if last_error:
        error_name = last_error.__class__.__name__
        # try get the url of the error else just include the whole error
        url = getattr(getattr(last_error, 'pool', None), 'host',
                      None) or last_error
        logger.info(f'Request session: {url} {error_name}')
    return super(RetryWithCallback, self).new(**kw)
async def proxies_test(app):
    """Forever: load all proxies from redis, test them, log the counts.

    Each round is followed by a pause of ``3600 * 0.2`` seconds.
    """
    pause_between_rounds = 3600 * 0.2
    while True:
        await asyncio.sleep(0.1)
        proxies = await app['redis_client'].get_all()
        logger.info(f"proxies count: {len(proxies)}")
        valid_total = await ProxyTester().all_proxies_test(app, proxies)
        logger.info(f"proxies valid count: {valid_total}")
        await asyncio.sleep(pause_between_rounds)
def evaluate(sess, image_paths, embeddings, labels_batch,
             image_paths_placeholder, labels_placeholder,
             batch_size_placeholder, learning_rate_placeholder,
             phase_train_placeholder, enqueue_op, actual_issame, batch_size,
             nrof_folds, log_dir, step, summary_writer, embedding_size):
    """Embed the LFW images, score them with ``lfw.evaluate`` and log it.

    The image paths are enqueued in rows of three together with their index
    as label; the labels coming back with each batch are used as row
    indices into the embedding array.  Accuracy and validation rate are
    logged, added to ``summary_writer`` and appended to
    ``<log_dir>/lfw_result.txt``.

    Args:
        sess: Session used to run the enqueue and embedding ops.
        image_paths: Paths of the images; must contain exactly
            ``2 * len(actual_issame)`` entries.
        embeddings, labels_batch: Tensors fetched for every batch.
        image_paths_placeholder, labels_placeholder: Fed to ``enqueue_op``.
        batch_size_placeholder, learning_rate_placeholder,
        phase_train_placeholder: Fed on every batch (learning rate 0.0,
            phase_train False).
        enqueue_op: Op that enqueues the paths and labels.
        actual_issame: Ground truth passed to ``lfw.evaluate``.
        batch_size: Maximum number of images per forward pass.
        nrof_folds: Forwarded to ``lfw.evaluate``.
        log_dir: Directory of ``lfw_result.txt``.
        step: Global step recorded with the summary and the result line.
        summary_writer: Receives the accuracy/val-rate/time summary.
        embedding_size: Width of one embedding vector.
    """
    start_time = time.time()
    # Run forward pass to calculate embeddings
    logger.debug('Running forward pass on LFW images: ')

    nrof_images = len(actual_issame) * 2
    assert (len(image_paths) == nrof_images)
    labels_array = np.reshape(np.arange(nrof_images), (-1, 3))
    image_paths_array = np.reshape(
        np.expand_dims(np.array(image_paths), 1), (-1, 3))
    sess.run(
        enqueue_op, {
            image_paths_placeholder: image_paths_array,
            labels_placeholder: labels_array
        })
    emb_array = np.zeros((nrof_images, embedding_size))
    nrof_batches = int(np.ceil(nrof_images / batch_size))
    label_check_array = np.zeros((nrof_images, ))
    for i in range(nrof_batches):
        # Keep ``batch_size`` itself untouched: it is also the stride used to
        # work out how many images are left.
        current_batch_size = min(nrof_images - i * batch_size, batch_size)
        emb, lab = sess.run(
            [embeddings, labels_batch],
            feed_dict={
                batch_size_placeholder: current_batch_size,
                learning_rate_placeholder: 0.0,
                phase_train_placeholder: False
            })
        emb_array[lab, :] = emb
        label_check_array[lab] = 1
    logger.debug('evaluate time: %.3f' % (time.time() - start_time))

    # Every image must have been embedded exactly where its label points.
    assert (np.all(label_check_array == 1))

    _, _, accuracy, val, val_std, far = lfw.evaluate(emb_array,
                                                     actual_issame,
                                                     nrof_folds=nrof_folds)

    logger.info('Accuracy: %1.3f+-%1.3f' %
                (np.mean(accuracy), np.std(accuracy)))
    logger.info('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' %
                (val, val_std, far))
    lfw_time = time.time() - start_time
    # Add validation loss and accuracy to summary
    summary = tf.Summary()
    # pylint: disable=maybe-no-member
    summary.value.add(tag='lfw/accuracy', simple_value=np.mean(accuracy))
    summary.value.add(tag='lfw/val_rate', simple_value=val)
    summary.value.add(tag='time/lfw', simple_value=lfw_time)
    summary_writer.add_summary(summary, step)
    with open(os.path.join(log_dir, 'lfw_result.txt'), 'at') as f:
        f.write('%d\t%.5f\t%.5f\n' % (step, np.mean(accuracy), val))
def SendMail(self, evt):
    """Send the text control's content as a reply to the current mail.

    After rendering, the reply is handed to ``self.director.ReplyMail`` and
    the director is queued on ``self.dq``.  Any failure is logged (with its
    traceback) rather than raised, and ``self.Jump()`` is called either way.

    Args:
        evt: The triggering event; not used.
    """
    self.Render()
    replycontent = self.textctrl.GetValue()
    try:
        self.director.ReplyMail(self.Mail.MAIL_ID, replycontent,
                                self.Mail.MAIL_SENDER, True)
        self.dq.put_nowait(self.director)
    except Exception:
        # Best-effort by design, but do not swallow KeyboardInterrupt /
        # SystemExit and keep the real cause visible in the log.
        logger.info('There is no mail to reply!', exc_info=True)
    self.Jump()
def __init__(self, image_size, block_size, norm, is_train,
             name='Generator', activation=tf.nn.relu):
    """Log the generator's creation and store its configuration.

    ``reuse`` always starts out as False.
    """
    logger.info('Init Generator %s', name)
    self.name = name
    self.image_size, self.block_size = image_size, block_size
    self.norm, self.activation = norm, activation
    self.is_train = is_train
    self.reuse = False
def get_twitter_users(input):
    """Search twitter users matching ``input`` and summarise the top five.

    Returns:
        ``{'num_users': <number of results>, 'users': [...]}`` where each
        user entry holds the username and the followers, following and
        favourites counts of one of the first five results.
    """
    credentials = OAuth(TWITTER_KEYS['access_key'],
                        TWITTER_KEYS['access_secret'],
                        TWITTER_KEYS['consumer_key'],
                        TWITTER_KEYS['consumer_secret'])
    twitter = Twitter(auth=credentials)
    logger.info('Sending a requests to twitter')
    results = twitter.users.search(q=input, count=20)

    top_users = [{
        'username': user['screen_name'],
        'followers_count': user['followers_count'],
        'following_count': user['friends_count'],
        'favourites_count': user['favourites_count']
    } for user in results[:5]]
    return {'num_users': len(results), 'users': top_users}
async def crawl_proxies(app):
    """Forever: run every scrapy spider from the parent directory.

    Logs the current proxy count once, then loops: list the spiders with
    ``scrapy list``, crawl each one with a ten-minute timeout, re-read the
    proxy count and pause for ``3600 * 0.5`` seconds.

    NOTE(review): the nesting of the trailing statements was reconstructed
    from a flattened source. Confirm against the original layout.
    """
    redis_client_key = 'redis_client'
    proxies_count = await app[redis_client_key].get_count()
    logger.info(f"now there are {proxies_count} proxies.")
    crawl_timeout = 60 * 10
    while True:
        await asyncio.sleep(3)
        with cd("../"):
            logger.info(f'now in {os.getcwd()}')
            # Disabled alternative kept from the original. The author noted
            # that these two lines are not implemented in Python 3.6 and
            # need Python 3.7 to run:
            #   process = await asyncio.create_subprocess_exec('scrapy list', stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
            #   stdout, stderr = await process.communicate()
            listing = subprocess.run(['scrapy', 'list'],
                                     stdout=subprocess.PIPE,
                                     encoding='utf8')
            for spider in listing.stdout.split():
                logger.info(spider)
                try:
                    subprocess.run(['scrapy', 'crawl', spider],
                                   timeout=crawl_timeout)
                except subprocess.TimeoutExpired:
                    logger.error(f'{spider} crawl TIMEOUT!')
                await asyncio.sleep(3)
            proxies_count = await app[redis_client_key].get_count()
        await asyncio.sleep(3600 * 0.5)
def get_wiki_search(query, exact_match=True):
    """Run a search against the wikidata API and summarise the hits.

    Args:
        query: Search text.
        exact_match: When true the query is wrapped in double quotes.

    Returns:
        ``{'items': <total hits>, 'wordcount': <sum of the word counts of
        the returned search entries>}``.
    """
    # wikipedia has many websites for specific language
    # E. g. english wiki: https://en.wikipedia.org/w/api.php
    # the one used below is supposed to be international
    # and search the whole wiki, instead of wiki of specific country
    url = 'https://www.wikidata.org/w/api.php?'

    # also: Quotes around words mark an "exact phrase" search.
    # For parameters they are also needed to delimit multi-word input.
    # source: https://www.mediawiki.org/wiki/Help:CirrusSearch#Prefer_phrase_matches
    if exact_match:
        search_term = '"' + query + '"'
    else:
        search_term = query
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'search',
        'srsearch': search_term
    }

    logger.info('Sending a request to wikipedia')
    payload = requests.get(url, params=params).json()['query']
    total_hits = payload['searchinfo']['totalhits']
    total_words = sum(hit['wordcount'] for hit in payload['search'])
    return {'items': total_hits, 'wordcount': total_words}
def run(self):
    """Execute every step of the cleaned chain and report the failures.

    Each chain record is ``(func_name, args, kw)``: the named method is
    looked up on ``self``, called with the arguments, and the object it
    returns is called in turn.  A step counts as successful when that final
    result is one of ``True``, ``'[SUCCESS]'`` or ``1``.  Failing records
    are printed and collected; at the end either ``'~No Error'`` is logged
    or the failures and ``self.dump()`` are logged as warnings.
    """
    success_values = [True, '[SUCCESS]', 1]
    err_recs = []
    chain = self.clean_dump(False)
    for rec in chain:
        func_name, args, kw = rec
        func = getattr(self, func_name)
        result = func(*args, **kw)()
        try:
            print(func_name)
            print(args, kw)
        except Exception:
            # Progress output is best-effort only; a print failure must not
            # abort the chain.
            pass
        if result not in success_values:
            # ``rec`` is the chain record of the failing step.
            print(rec)
            err_recs.append(rec)
    if not err_recs:
        logger.info('~No Error')
    else:
        logger.warning(err_recs)
        logger.warning(self.dump())
def OA_NEED_CARE_FIND_BUYER(opener, store):
    """Post the eBay case grid query and extract (id, item, buyer) tuples.

    Flagged as buggy by the original author.

    NOTE(review): the ``store`` argument is overwritten with ``'842'``
    before use, so the caller's value is ignored.
    """
    store = '842'
    url = 'http://banggood.sellercube.com/eBayCaseAttempt/Grid'
    form = {
        'ItemID': '',
        'rp': '300',
        'sortname': 'ImportTime',
        'Site': '',
        'DispatchUserID': '0',
        'sortorder': 'desc',
        'eBayName': store,
        'IsRespone': '0',
        'BuyerUserID': '',
        'query': '',
        'TurnOverUser': '',
        'qtype': '',
        'page': '1'
    }
    response = Firefox(opener).post(url, form)
    # capture groups: id, item, buyer
    matches = re.findall(
        r'"id":"(\w*?)","cell":\["homesale_estore","(\d*?)",".*?",".*?",".*?","(.*?)"',
        response)
    logger.info(str(matches))
    return matches
def save_variables_and_metagraph(sess, saver, summary_writer, model_dir,
                                 model_name, step):
    """Checkpoint the variables and, once, export the metagraph.

    The checkpoint is always written (without meta graph).  The metagraph
    file is only exported when it does not exist yet.  Both durations are
    logged and added to ``summary_writer`` under ``time/save_variables``
    and ``time/save_metagraph`` (0 when the export was skipped).
    """
    # Save the model checkpoint
    logger.info('Saving variables')
    began = time.time()
    checkpoint_path = os.path.join(model_dir, 'model-%s.ckpt' % model_name)
    saver.save(sess, checkpoint_path, global_step=step,
               write_meta_graph=False)
    save_time_variables = time.time() - began
    logger.info('Variables saved in %.2f seconds' % save_time_variables)

    metagraph_filename = os.path.join(model_dir,
                                      'model-%s.meta' % model_name)
    save_time_metagraph = 0
    if not os.path.exists(metagraph_filename):
        logger.info('Saving metagraph')
        began = time.time()
        saver.export_meta_graph(metagraph_filename)
        save_time_metagraph = time.time() - began
        logger.info('Metagraph saved in %.2f seconds' % save_time_metagraph)

    summary = tf.Summary()
    # pylint: disable=maybe-no-member
    timings = (('time/save_variables', save_time_variables),
               ('time/save_metagraph', save_time_metagraph))
    for tag, seconds in timings:
        summary.value.add(tag=tag, simple_value=seconds)
    summary_writer.add_summary(summary, step)
async def proxy_test(self, app, session, proxy):
    """Request ``TEST_URL`` through ``proxy`` and update its redis score.

    A 200 response maxes the proxy's score and increments
    ``self.sucess_count``; any other status, or any exception, decreases
    the score.

    NOTE(review): the position of the 0.5 s sleep (after the status
    handling, inside the request context) was reconstructed from a
    flattened source. Confirm.
    """
    proxy_url = 'http://' + proxy
    try:
        async with session.get(TEST_URL,
                               headers=HEADERS,
                               proxy=proxy_url,
                               timeout=TIMEOUT) as resp:
            if resp.status != 200:
                logger.info(
                    f'{proxy} status code error: {resp.status}')
                await app['redis_client'].decrease(proxy)
            else:
                logger.info(f'{proxy} test useful')
                await app['redis_client'].score2max(proxy)
                self.sucess_count += 1
            await asyncio.sleep(0.5)
    except Exception:
        logger.info(f'{proxy} test invalid')
        await app['redis_client'].decrease(proxy)
def start(self):
    """Poll the local IP every 30 seconds and refresh the DNS records.

    While ``self._is_alive`` is true: make sure the domain id and the
    record dict are loaded, read the local IP and, when it differs from
    ``self._current_ip``, call ``self.ddns`` for every record whose value
    is not already that IP.  Exceptions are logged and the loop carries on.
    """
    while self._is_alive:
        try:
            if self._domain_id is None:
                self.get_domain()
            if not self._record_dict:
                self.get_record()
            ip = self.get_ip()
            logger.info("LOCAL IP ADDR:" + ip)
            if self._current_ip != ip:
                # Remember the old address for the log line before it is
                # overwritten below.
                previous_ip = self._current_ip
                for sub_domain, record in self._record_dict.items():
                    if ip == record['value']:
                        continue
                    if self.ddns(ip, record['id'], sub_domain):
                        self._current_ip = ip
                        logger.info("[DNSPOD]CHANGE %s DDNS IP [%s --> %s]" %
                                    (sub_domain, previous_ip, ip))
                    else:
                        logger.info("[DNSPOD]REFRESH DDNS FAIL")
        except Exception as e:
            logger.error(e)
        time.sleep(30)
async def fetch(session, url, proxy):
    """GET ``url`` through ``proxy`` and return the body text.

    Up to five attempts are made, each with a five second total timeout.
    The text of the first successful response is returned; if every
    attempt raises, the function falls through and returns None.
    """
    # setting a timeout for bad proxies
    timeout = aiohttp.ClientTimeout(total=5)
    # setting 5 retries in case of proxy fail
    # sometimes the url (due to proxies) is unreachable
    max_attempts = 5
    for attempt in range(max_attempts):
        logger.info(f'Gathering single async request: {attempt}')
        try:
            async with session.get(url,
                                   headers=headers,
                                   proxy=proxy,
                                   timeout=timeout) as response:
                logger.info(
                    'Gathering single async request: successfully received a response'
                )
                body = await response.text()
                return body
        except Exception as exc:
            # TODO: setting to logger.exception breaks everything !?
            # by not continuing the for loop and stopping just before the
            # below line
            logger.info(f'Single async requests exception: {repr(exc)}')
def start(self):
    """Poll the local IP every 30 seconds and refresh the DNS records.

    While ``self._is_alive`` is true: make sure the domain id and the
    record dict are loaded, read the local IP and, when it differs from
    ``self._current_ip``, call ``self.ddns`` for every record whose value
    is not already that IP.  Exceptions are logged and the loop carries on.
    """
    while self._is_alive:
        try:
            if self._domain_id is None:
                self.get_domain()
            if not self._record_dict:
                self.get_record()
            ip = self.get_ip()
            logger.info("LOCAL IP ADDR:" + ip)
            if self._current_ip != ip:
                # Remember the old address for the log line before it is
                # overwritten below.
                previous_ip = self._current_ip
                for sub_domain, record in self._record_dict.items():
                    if ip == record['value']:
                        continue
                    if self.ddns(ip, record['id'], sub_domain):
                        self._current_ip = ip
                        logger.info(
                            "[DNSPOD]CHANGE %s DDNS IP [%s --> %s]" %
                            (sub_domain, previous_ip, ip))
                    else:
                        logger.info("[DNSPOD]REFRESH DDNS FAIL")
        except Exception as e:
            logger.error(e)
        time.sleep(30)
def _network_input_dim(nn):
    """Return the square input size used for network ``nn``."""
    if nn == 'alexnet':
        return 227
    if nn == 'vgg16':
        return 224
    raise ValueError(
        "Unsupported network {!r}; expected 'alexnet' or 'vgg16'".format(nn))


def _load_resized_image(path, dim):
    """Read ``path`` with OpenCV, resize to ``dim`` x ``dim``, cast to uint64."""
    return cv2.resize(cv2.imread(path), (dim, dim)).astype('uint64')


def main(setType, nn, a=None, b=None):
    """Load the crowdai images into a DataFrame.

    Args:
        setType: ``'train'`` reads ``crowdai_train/crowdai/<class folder>``
            and returns the columns ``image``, ``label`` and ``location``;
            ``'test'`` reads ``crowdai_test/crowdai/*.jpg`` and returns the
            column ``image``.  Any other value returns None.
        nn: ``'alexnet'`` (227x227 input) or ``'vgg16'`` (224x224 input).
        a, b: Optional slice bounds applied to each training folder's image
            list; ``None`` means unbounded.

    Returns:
        A pandas DataFrame, or None for an unknown ``setType``.

    Raises:
        ValueError: If ``nn`` is not a supported network.
    """
    if setType == 'train':
        folders = glob.glob('crowdai_train/crowdai/*')

        # Get folder names
        parts = [Path(folder).parts[2] for folder in folders]
        # Get the classes
        classes = [
            re.search(r'c_(.*)', part, re.I | re.M).group(1) for part in parts
        ]

        train_data = {'image': [], 'label': [], 'location': []}

        logger.info('Creating train arrays for {} ...'.format(nn))
        dim = _network_input_dim(nn)

        for lbl, folder in zip(classes, folders):
            logger.info('Opening folder: {} ...'.format(lbl))
            # A bound of None leaves that side of the slice open, so this
            # also covers the "read everything" case.
            imgs = glob.glob(folder + '/*.jpg')[a:b]
            for img in imgs:
                logger.info('Creating data point {} ...'.format(img))
                train_data['image'].append(_load_resized_image(img, dim))
                train_data['label'].append(lbl)
                train_data['location'].append(folder)
                logger.info('Finished creating data point {}.'.format(img))

        logger.info('Finished creating arrays. Writing to DF...')
        np.set_printoptions(threshold=np.inf)
        df = pd.DataFrame.from_dict(train_data, orient='columns')
        logger.info('Finished writing to DF.')
        return df

    # Deal with test data
    if setType == 'test':
        test_data = {'image': []}
        imgs = glob.glob('crowdai_test/crowdai/*.jpg')

        logger.info('Creating test arrays...')
        dim = _network_input_dim(nn)

        for img in imgs:
            logger.info('Creating data point {} ...'.format(img))
            test_data['image'].append(_load_resized_image(img, dim))
            logger.info('Finished creating data point {} ...'.format(img))

        logger.info('Finished creating arrays. Writing to DF...')
        np.set_printoptions(threshold=np.inf)
        # Build the frame from the test images (not from the training dict,
        # which does not exist in this branch).
        df = pd.DataFrame.from_dict(test_data, orient='columns')
        logger.info('Finished writing to DF.')
        return df

    return None
def run(args):
    """Build the CycleGAN graph, optionally train it, then test it.

    Data is read with ``get_data``, the model is built from ``args``, and a
    ``tf.train.Supervisor`` manages sessions and periodic saving under
    ``./logs/<model_name>``.  Training runs only when ``args.train`` is set;
    the test pass always runs and writes to ``results/<model_name>``.
    """
    logger.info('Read data:')
    # get_data loads the data with h5py; this should be changed to use
    # TensorFlow's own mechanisms.
    train_A, train_B, test_A, test_B = get_data(args.task, args.image_size)

    logger.info('Build graph:')
    model = CycleGAN(args)

    # Collect the variables that need to be saved.
    variables_to_save = tf.global_variables()
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    # NOTE(original author): is var_list no longer used?
    saver = FastSaver(variables_to_save)

    logger.info('Trainable vars:')
    # Fetch the parameter list and write it to the log.
    trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  tf.get_variable_scope().name)
    for variable in trainable:
        logger.info('  %s %s', variable.name, variable.get_shape())

    # This part is rather important!
    model_name = (args.load_model if args.load_model != '' else
                  '{}_{}'.format(
                      args.task,
                      datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))
    logs_root = './logs'
    makedirs(logs_root)
    logdir = os.path.join(logs_root, model_name)
    logger.info('Events directory: %s', logdir)
    summary_writer = tf.summary.FileWriter(logdir)

    def init_fn(sess):
        logger.info('Initializing all parameters.')
        sess.run(init_all_op)

    supervisor_options = dict(
        is_chief=True,
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=model.global_step,
        save_model_secs=300,
        save_summaries_secs=30)
    sv = tf.train.Supervisor(**supervisor_options)

    if args.train:
        logger.info("Starting training session.")
        with sv.managed_session() as sess:
            model.train(sess, summary_writer, train_A, train_B)

    logger.info("Starting testing session.")
    with sv.managed_session() as sess:
        base_dir = os.path.join('results', model_name)
        makedirs(base_dir)
        model.test(sess, test_A, test_B, base_dir)
def init_fn(sess):
    """Log the initialisation, then run ``init_all_op`` in ``sess``."""
    logger.info('Initializing all parameters.')
    # ``init_all_op`` is not defined here; it comes from the enclosing scope.
    initializer = init_all_op
    sess.run(initializer)
""" # Training in batches because of the dataset size # p = 0 # q = 20 # iteration = 0 # top = 125 # The total pics in the class with fewest images + 1 theModel = sys.argv[1] df = preprocessing.main('train', theModel, None, None) #Read all data X = np.array(df['image'].tolist()) # Generate array of arrays for X, and array of vectors for y df['vector_labels'] = pd.get_dummies(df['label']).values.tolist() Y = np.array(df['vector_labels'].tolist()) logger.info('X.shape: {}'.format(X.shape)) logger.info('Y.shape: {}'.format(Y.shape)) logger.info('Starting training...') train(X, Y, iteration, theModel) # while (p < top): # logger.info('Reading DF...') # df = preprocessing.main('train', theModel, p, q) #Let's do (q - p) pics from all classes per iter # X = np.array(df['image'].tolist()) # Generate array of arrays for X, and array of vectors for y # df['vector_labels'] = pd.get_dummies(df['label']).values.tolist() # Y = np.array(df['vector_labels'].tolist()) # logger.info('X.shape: {}'.format(X.shape)) # logger.info('Y.shape: {}'.format(Y.shape))
if __name__ == '__main__':
    # Due to RAM constraints, I'm reading the images in batches of 10
    # i.e. 10 images per class per iteration.
    # If access to GPU or a massive RAM, you can:
    # 1. Set p = None and q = None
    # 2. Set p = 0 and q = (n + 1) where the n is the number of pictures to
    #    read per iteration
    # Make sure to update p and q by replacing 10 with n

    # Training in batches because of the dataset size
    p = 0
    q = 21
    iteration = 0
    top = 125  # The total pics in the class with fewest images + 1

    logger.info('Reading DF...')

    theModel = sys.argv[1]

    # Python spells logical "or" as ``or``; ``||`` is a syntax error.
    if p is None or q is None:
        # Read all data
        df = preprocessing.main('train', theModel, None, None)
        # Generate array of arrays for X, and array of vectors for y
        X = np.array(df['image'].tolist())
        df['vector_labels'] = pd.get_dummies(df['label']).values.tolist()
        Y = np.array(df['vector_labels'].tolist())

        logger.info('X.shape: {}'.format(X.shape))
        logger.info('Y.shape: {}'.format(Y.shape))

        logger.info('Starting training...')
        train(X, Y, iteration, theModel)
    # NOTE(review): no batched branch (p/q not None) is visible in this
    # part of the file, so with the defaults above nothing is trained here.
    # Confirm whether the batched loop lives elsewhere.
def train(X, Y, iteration, whichOne):
    """Train (or continue training) a model and save it to disk.

    The data is split 67/33 into train and test sets.  If
    ``saved_models/agrix_<whichOne>.h5`` exists the model is loaded from
    it, otherwise a fresh one is built with ``model_defs.<whichOne>()``.
    After fitting, the model is evaluated on the test split, the score is
    logged and the model is written back to the same file.

    Args:
        X: Input samples.
        Y: Targets, aligned with ``X``.
        iteration: Not used by this function.
        whichOne: ``'alexnet'`` or ``'vgg16'``.

    Raises:
        ValueError: If ``whichOne`` is not a supported architecture.
    """
    if whichOne not in ('alexnet', 'vgg16'):
        # Fail before the split/fit work instead of hitting an unbound
        # ``model`` afterwards.
        raise ValueError(
            "Unsupported model {!r}; expected 'alexnet' or 'vgg16'".format(
                whichOne))

    XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=42)

    model_path = 'saved_models/agrix_{}.h5'.format(whichOne)
    if os.path.exists(model_path):
        logger.info(
            'Reading {} from file. Time to improve!!'.format(whichOne))
        model = load_model(model_path)
    else:
        logger.info('Instantiating model for the first time')
        model = getattr(model_defs, whichOne)()

    # Run model
    model.fit(XTrain,
              yTrain,
              batch_size=64,
              epochs=100,
              validation_split=0.33,
              shuffle=True,
              verbose=1)

    scores = model.evaluate(XTest, yTest, batch_size=64, verbose=1)
    logger.info("VALIDATION SCORE: {}: {}%".format(model.metrics_names[1],
                                                   scores[1] * 100))

    # The very first run has no ``saved_models`` directory yet.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model.save(model_path)
    logger.info('Saved {} to file.'.format(whichOne))
def get_google_search_response(person_name, exact_match, proxies,
                               country_code):
    """Send a google search for ``person_name`` and return the response.

    Args:
        person_name: Text to search for (sent utf8-encoded).
        exact_match: Use the ``as_epq`` parameter instead of ``q``.
        proxies: Proxies passed to the request.
        country_code: Currently unused (see the notes in the body).

    Returns:
        The response object, or False when the page contains google's
        recaptcha script.
    """
    # set params
    query_key = 'as_epq' if exact_match else 'q'
    params = {query_key: person_name.encode('utf8')}

    # make sure to set google search country code because
    # when using proxies the google results will depend on
    # the random country the proxy is located at and
    # the results will differ with every random proxy call
    # also keep in mind that including this parameter (as well as others,
    # likely) will increase the prob of triggering bot detection
    # so using this without a proxy will quickly result
    # in google banning the ip address and asking for recaptcha

    # set lr and cr params, maybe both of the will result
    # in actually simulating a search from specified country
    # source: https://github.com/MarioVilas/googlesearch/blob/master/googlesearch/__init__.py

    # as of 2021 the request triggers some kind of google protection
    # by not rendering the full page in html. Depending on the exact params
    # set in the html returned it says
    # "Jei per kelias sekundes nebūsite nukreipti, <...>"
    # or just returns a different html structure that does not include
    # div slim_appbar. div slim_appbar contains the number of total results
    # and we want to parse it.
    #
    # For future reference, the following does not help:
    # following the url provided together with
    #   "if you are not redirected within a few".
    # fixing proxy location (in private.py) e.g. US only.
    # do not setting cr lr params.
    #
    # Both country overrides are therefore disabled at the moment:
    #   with proxies:      params['cr'] = 'us'
    #                      params['lr'] = 'lang_' + 'us'
    #   with country_code: params['cr'] = country_code
    #                      params['lr'] = 'lang_' + country_code

    # set headers - this is important!!!
    # if headers are not set google does not return
    # the number of search results and none of the divs
    # responsible for storing number of results are there.
    # Basically, the structure of html is totally different.
    # IMPORTANT: the above holds for requests sent directly
    # IMPORTANT: and for requests send through a proxy.

    # UserAgent() is a heroku app that sometimes fails
    # lets save a list of browsers for headers locally,
    # so that we don't have to call the heroku app
    # again and again every time.

    # Randomising from locally stored browsers does
    # increase the rate of captcha and invalid responses
    # For now lets fall back to UserAgent and investigate
    # the reasons later on.
    user_agent = UserAgent().random
    headers = {'User-Agent': user_agent}

    # make the request
    session = requests_retry_session()
    response = session.get('https://www.google.com/search',
                           params=params,
                           headers=headers,
                           proxies=proxies)

    # if recaptcha in the response, the client that sent the request
    # is blacklisted so lets return False
    blacklisted = 'https://www.google.com/recaptcha/api.js' in response.text
    if blacklisted:
        logger.info('Received Captcha request (google search)')
        return False

    logger.info('Received a valid response (google search)')
    return response
def _make_triplet_queue():
    """Create a FIFO queue of (image paths, labels) triplets."""
    return data_flow_ops.FIFOQueue(capacity=100000,
                                   dtypes=[tf.string, tf.int64],
                                   shapes=[(3, ), (3, )],
                                   shared_name=None,
                                   name=None)


def _build_triplet_preprocessing(input_queue, args, nrof_preprocess_threads):
    """Create the image decoding/augmentation ops for one input queue."""
    images_and_labels = []
    for _ in range(nrof_preprocess_threads):
        filenames, label = input_queue.dequeue()
        images = []
        for filename in tf.unstack(filenames):
            file_contents = tf.read_file(filename)
            image = tf.image.decode_image(file_contents, channels=3)

            if args.random_crop:
                image = tf.random_crop(
                    image, [args.image_size, args.image_size, 3])
            else:
                image = tf.image.resize_image_with_crop_or_pad(
                    image, args.image_size, args.image_size)
            if args.random_flip:
                image = tf.image.random_flip_left_right(image)

            # pylint: disable=no-member
            image.set_shape((args.image_size, args.image_size, 3))
            images.append(tf.image.per_image_standardization(image))
        images_and_labels.append([images, label])
    return images_and_labels


def _batch_triplets(images_and_labels, batch_size_placeholder, args,
                    nrof_preprocess_threads):
    """Join the preprocessing threads' outputs into (images, labels) batches."""
    return tf.train.batch_join(
        images_and_labels,
        batch_size=batch_size_placeholder,
        shapes=[(args.image_size, args.image_size, 3), ()],
        enqueue_many=True,
        capacity=4 * nrof_preprocess_threads * args.batch_size,
        allow_smaller_final_batch=True)


def main(args):
    """Build the triplet-loss + DANN training graph and run the training.

    Creates time-stamped log/model directories, records the arguments and
    revision info, builds three input pipelines (ID, camera, validation),
    the inference graph, the triplet and DANN losses and two train ops,
    then loops: train one epoch, save both checkpoints, and evaluate on LFW
    when ``args.lfw_dir`` is set.

    Args:
        args: Parsed command-line arguments (see the attribute accesses in
            the body for the fields that are required).

    Returns:
        The directory the primary model checkpoints are written to.
    """
    network = importlib.import_module(args.model_def)

    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    model_dir_plus = os.path.join(
        os.path.expanduser(args.models_plus_base_dir), subdir)
    # Create the log and model directories if they don't exist
    for directory in (log_dir, model_dir, model_dir_plus):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    # Write arguments to a text file
    facenet.write_arguments_to_file(args,
                                    os.path.join(log_dir, 'arguments.txt'))

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)
    train_set_ID = facenet.get_dataset(args.data_dir_ID)
    train_set_camera = facenet.get_dataset(args.data_dir_camera)

    logger.info('Model directory: %s' % model_dir)
    logger.info('Log directory: %s' % log_dir)
    if args.pretrained_model:
        logger.info('Pre-trained model: %s' %
                    os.path.expanduser(args.pretrained_model))

    if args.lfw_dir:
        logger.info('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(
            os.path.expanduser(args.lfw_dir), pairs, args.lfw_file_ext)

    # associative, fengchen
    assoc = Associative(network, args)

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False)

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32,
                                                   name='learning_rate')
        batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')

        image_paths_placeholder_ID = tf.placeholder(tf.string,
                                                    shape=(None, 3),
                                                    name='image_paths_ID')
        image_paths_placeholder_camera = tf.placeholder(
            tf.string, shape=(None, 3), name='image_paths_camera')
        image_paths_placeholder_valid = tf.placeholder(
            tf.string, shape=(None, 3), name='image_paths_valid')

        labels_placeholder_ID = tf.placeholder(tf.int64,
                                               shape=(None, 3),
                                               name='labels_ID')
        labels_placeholder_camera = tf.placeholder(tf.int64,
                                                   shape=(None, 3),
                                                   name='labels_camera')
        labels_placeholder_valid = tf.placeholder(tf.int64,
                                                  shape=(None, 3),
                                                  name='labels_valid')

        input_queue_ID = _make_triplet_queue()
        input_queue_camera = _make_triplet_queue()
        input_queue_valid = _make_triplet_queue()

        enqueue_op_ID = input_queue_ID.enqueue_many(
            [image_paths_placeholder_ID, labels_placeholder_ID])
        enqueue_op_camera = input_queue_camera.enqueue_many(
            [image_paths_placeholder_camera, labels_placeholder_camera])
        enqueue_op_valid = input_queue_valid.enqueue_many(
            [image_paths_placeholder_valid, labels_placeholder_valid])

        # The three pipelines are built in the same order as before (ID,
        # camera, validation) so the graph ops are created identically.
        nrof_preprocess_threads = 4
        images_and_labels_ID = _build_triplet_preprocessing(
            input_queue_ID, args, nrof_preprocess_threads)
        images_and_labels_camera = _build_triplet_preprocessing(
            input_queue_camera, args, nrof_preprocess_threads)
        images_and_labels_valid = _build_triplet_preprocessing(
            input_queue_valid, args, nrof_preprocess_threads)

        image_batch_ID, labels_batch_ID = _batch_triplets(
            images_and_labels_ID, batch_size_placeholder, args,
            nrof_preprocess_threads)
        image_batch_ID = tf.identity(image_batch_ID, 'image_batch_ID')
        image_batch_ID = tf.identity(image_batch_ID, 'input_ID')
        labels_batch_ID = tf.identity(labels_batch_ID, 'label_batch_ID')

        image_batch_camera, labels_batch_camera = _batch_triplets(
            images_and_labels_camera, batch_size_placeholder, args,
            nrof_preprocess_threads)
        image_batch_camera = tf.identity(image_batch_camera,
                                         'image_batch_camera')
        image_batch_camera = tf.identity(image_batch_camera, 'input_camera')
        labels_batch_camera = tf.identity(labels_batch_camera,
                                          'label_batch_camera')

        image_batch_valid, labels_batch_valid = _batch_triplets(
            images_and_labels_valid, batch_size_placeholder, args,
            nrof_preprocess_threads)
        image_batch_valid = tf.identity(image_batch_valid,
                                        'image_batch_valid')
        labels_batch_valid = tf.identity(labels_batch_valid,
                                         'label_batch_valid')

        def infer(images):
            """Run the network on ``images`` with the shared settings."""
            return network.inference(
                images,
                args.keep_probability,
                phase_train=phase_train_placeholder,
                bottleneck_layer_size=args.embedding_size,
                weight_decay=args.weight_decay)

        # Build the inference graph
        prelogits_ID, _, _, _, _ = infer(image_batch_ID)
        prelogits_camera, _, _, _, _ = infer(image_batch_camera)
        prelogits_valid, _, _, _, _ = infer(image_batch_valid)

        embeddings_ID = tf.nn.l2_normalize(prelogits_ID,
                                           1,
                                           1e-10,
                                           name='embeddings_ID')
        embeddings_camera = tf.nn.l2_normalize(prelogits_camera,
                                               1,
                                               1e-10,
                                               name='embeddings_camera')
        embeddings_valid = tf.nn.l2_normalize(prelogits_valid,
                                              1,
                                              1e-10,
                                              name='embeddings_valid')

        # Split embeddings into anchor, positive and negative and calculate
        # triplet loss
        anchor_ID, positive_ID, negative_ID = tf.unstack(
            tf.reshape(embeddings_ID, [-1, 3, args.embedding_size]), 3, 1)
        triplet_loss_ID = facenet.triplet_loss(anchor_ID, positive_ID,
                                               negative_ID, args.alpha)

        anchor_camera, positive_camera, negative_camera = tf.unstack(
            tf.reshape(embeddings_camera, [-1, 3, args.embedding_size]), 3, 1)
        triplet_loss_camera = facenet.triplet_loss(anchor_camera,
                                                   positive_camera,
                                                   negative_camera,
                                                   args.alpha)

        images_mmd_ID, images_mmd_camera, _, _ = (
            assoc.get_image_and_label_dann())
        feature_map3_ID, _, _, _, _ = infer(images_mmd_ID)
        feature_map3_camera, _, _, _, _ = infer(images_mmd_camera)

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        dann_loss = 0.1 * losses.dann_loss(feature_map3_ID,
                                           feature_map3_camera, 1)

        learning_rate = tf.train.exponential_decay(
            learning_rate_placeholder,
            global_step,
            args.learning_rate_decay_epochs * args.epoch_size,
            args.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        # Calculate the total losses
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        triplet_loss = tf.add_n([triplet_loss_ID] + [triplet_loss_camera] +
                                regularization_losses,
                                name='triplet_loss')

        # associative, fengchen
        loss_total = tf.add_n([triplet_loss_ID] + [triplet_loss_camera] +
                              [dann_loss] + regularization_losses,
                              name='loss_total')

        saver_plus = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters
        train_op = facenet.train(loss_total, global_step, args.optimizer,
                                 learning_rate, args.moving_average_decay,
                                 tf.global_variables())
        train_op_triplet = facenet.train(triplet_loss, global_step,
                                         args.optimizer, learning_rate,
                                         args.moving_average_decay,
                                         tf.global_variables())

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        # Initialize variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})
        sess.run(tf.local_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():
            if args.pretrained_model:
                logger.info('Restoring pretrained model: %s' %
                            args.pretrained_model)
                saver.restore(sess,
                              os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                # Train for one epoch
                train(args, sess, train_set_ID, train_set_camera, epoch,
                      image_paths_placeholder_ID,
                      image_paths_placeholder_camera, labels_placeholder_ID,
                      labels_placeholder_camera, labels_batch_ID,
                      labels_batch_camera, batch_size_placeholder,
                      learning_rate_placeholder, phase_train_placeholder,
                      enqueue_op_ID, enqueue_op_camera, global_step,
                      embeddings_ID, embeddings_camera, triplet_loss,
                      loss_total, triplet_loss_ID, triplet_loss_camera,
                      dann_loss, regularization_losses, train_op,
                      train_op_triplet, summary_writer,
                      args.learning_rate_schedule_file, args.embedding_size)

                # Save variables and the metagraph if it doesn't exist
                # already
                save_variables_and_metagraph(sess, saver, summary_writer,
                                             model_dir, subdir, step)
                save_variables_and_metagraph(sess, saver_plus,
                                             summary_writer, model_dir_plus,
                                             subdir, step)

                # Evaluate on LFW
                if args.lfw_dir:
                    evaluate(sess, lfw_paths, embeddings_valid,
                             labels_batch_valid,
                             image_paths_placeholder_valid,
                             labels_placeholder_valid,
                             batch_size_placeholder,
                             learning_rate_placeholder,
                             phase_train_placeholder, enqueue_op_valid,
                             actual_issame, args.batch_size,
                             args.lfw_nrof_folds, log_dir, step,
                             summary_writer, args.embedding_size)

    return model_dir
def wrapper(*args, **kwargs):
    """Call ``fn`` repeatedly until it yields a truthy result.

    ``fn``, ``max_tries`` and ``logger`` are free names resolved outside this
    function (presumably the enclosing decorator's scope -- confirm).

    Every attempt is logged with its zero-based index. The first truthy
    return value of ``fn(*args, **kwargs)`` is handed back to the caller.
    When every one of the ``max_tries`` attempts returns something falsy
    (``False``, ``None``, ``0``, empty containers, ...), ``None`` is returned.
    Exceptions raised by ``fn`` are not caught here.
    """
    for attempt in range(max_tries):
        logger.info(f'Trying {fn.__name__} {attempt}')
        result = fn(*args, **kwargs)
        if not result:
            # falsy result counts as a failed attempt -> try again
            continue
        return result
    # all attempts exhausted without a truthy result
    return None
def google_translate(google_data, proxies):
    """Translate the title and snippet of every search item to English, in place.

    Uses py-googletrans (https://github.com/ssut/py-googletrans). To send only
    one request, all titles and snippets are glued into a single string with
    a ``|||`` separator, translated, and then split back apart.

    Args:
        google_data: dict with an ``'items'`` list; every item must provide
            ``'title'`` and ``'snippet'`` strings.
        proxies: forwarded unchanged to ``Translator(proxies=...)``.

    Returns:
        The same ``google_data`` object. Items are overwritten with the
        translated (whitespace-stripped) title and snippet. The data is
        returned untouched when there are no items, or when the translated
        text no longer splits into exactly one title and one snippet per
        item (the separator was altered during translation).

    Raises:
        RuntimeError: if every translation attempt raised; chained to the
            last underlying exception.
    """
    # TODO: do in one batch by giving an array
    items = google_data['items']
    if not items:
        # nothing to translate -> do not send an empty request
        return google_data

    separator = '|||'
    max_attempts = 5

    # collect items to translate as title, snippet, title, snippet, ...
    # so that even positions are titles and odd positions are snippets
    pieces = []
    for item in items:
        pieces.append(item['title'])
        pieces.append(item['snippet'])
    combined = f' {separator} '.join(pieces)

    # clear non alpha num, e.g ☀ throws error in translator
    # keep . and , and ' and the | used by the separator; add else if needed
    # https://stackoverflow.com/questions/1276764/stripping-everything-but-alphanumeric-chars-from-a-string-in-python
    text_to_translate = [re.sub(r"[^\w.,|']", ' ', combined)]

    # Translator creates its own requests session. Overwriting that session
    # with a retrying one would also force token acquisition through the
    # proxy, so the default session is kept and the retry is done here.
    translator = Translator(proxies=proxies, timeout=5)

    # temp fix: https://github.com/ssut/py-googletrans/issues/234
    # also this is good for cases when proxy connection fails
    translated = None
    last_error = None
    for attempt in range(max_attempts):
        try:
            translated = [
                result.text
                for result in translator.translate(text_to_translate, dest='en')
            ]
        except Exception as e:  # deliberately broad: any failure means "retry"
            last_error = e
            logger.info(f'google translate error {attempt}')
            # start over with a fresh translator (new session / token)
            translator = Translator(proxies=proxies, timeout=5)
        else:
            logger.info('Successfully google translated text')
            break

    if translated is None:
        # previously this fell through to an UnboundLocalError on `translated`
        raise RuntimeError(
            f'google translate failed after {max_attempts} attempts'
        ) from last_error

    # ungroup and split back to titles and snippets; the pieces carry the
    # padding spaces that surrounded the separator, so strip them
    parts = ([part.strip() for part in translated[0].split(separator)]
             if translated else [])
    if len(parts) != 2 * len(items):
        # the separator did not survive translation; assigning anyway would
        # pair items with the wrong text, so keep the original data instead
        logger.warning(
            'google translate returned %d parts for %d items, '
            'leaving items untranslated', len(parts), len(items))
        return google_data

    # assign
    for item, title, snippet in zip(items, parts[0::2], parts[1::2]):
        item['snippet'] = snippet
        item['title'] = title
    return google_data