Example #1
def get_flair_entities(input, score_threshold=0.9):

    sentence = Sentence(input, use_tokenizer=True)
    model.predict(sentence)

    # refactor flair output
    entities = []
    prev_end_pos = 0
    prev_entity_part = ''
    for entity in sentence.to_dict(tag_type='ner')['entities']:

        if entity['labels'][0]._score < score_threshold:
            continue

        logger.info('flair entity detected: ' + str(entity))
        # merge with the previous entity when the two are adjacent in the text
        if prev_end_pos + 1 == entity['start_pos']:
            del entities[-1]
            final_entity = prev_entity_part + ' ' + entity['text']
        else:
            final_entity = entity['text']

        entities.append(final_entity.strip())
        prev_end_pos = entity['end_pos']
        prev_entity_part = final_entity

    return entities
Example #2
 async def fetch_all(urls, loop):
     async with aiohttp.ClientSession(loop=loop) as session:
         logger.info('Gathering async requests')
         results = await asyncio.gather(
             *[fetch(session, url, proxy) for url in urls],
             return_exceptions=True)
         return results
Example #3
 def SvcDoRun(self):
     logger.info("[WIN SERVICE]start service ddns....")
     self._ddns_Loader.start()
     while self.isAlive:
         # time.sleep(1)
         # wait until the service is stopped
         win32event.WaitForSingleObject(self.hWaitStop, win32event.INFINITE)
Example #4
def get_google_search_result_count(person_name, exact_match, proxies,
                                   country_code):
    """
    Returns either bool False or int as number of search results found.
    False will trigger the decorator to run this exact function again
    with a hope to get the function to return an int.

    Major problem with it is that the results are chaotic and not
    reproducible, because google returns different structures depending
    on the random proxy through which the request is being sent.
    """

    response = get_google_search_response(person_name, exact_match, proxies,
                                          country_code)

    # parse the results and extract the part of html
    # which contains the number of search results found
    soup = BeautifulSoup(response.text, 'html.parser')
    results_div = soup.find('div',
                            attrs={'id': ['resultStats', 'slim_appbar']})

    # TODO: what if the results are in fact 0? e.g. sosicc cequel tycoonkingz
    #  have to find a way to separate this case from other cases where we return False.
    #  It should look something like this: if X (no results found) is in soup: return 0
    if not results_div or not results_div.text:
        logger.info(
            'Google search does not contain the search results div / or there are 0 results'
        )
        return False

    def _parse_number_of_results(results_div_text):
        """
        parse total number of search results given the text
        inside the div that should hold the number.

        Basically the function finds numbers in the string and
        returns the first number found as int
        """

        # strip spaces, commas and other separators so that the
        # digits of the result count end up glued together and the
        # first number can be extracted easily
        results_div_text = unidecode.unidecode(results_div_text)
        results_div_text = ''.join(
            [i for i in results_div_text if i.isalpha() or i.isnumeric()])

        # first number is the one we are looking for
        # the second number should be the time it took
        # for google to return the search results
        # e.g Apie 54 500 000 rezult. (0,67 sek.)
        regex = r"[\d\s]+(?:\.(?:\s*\d){2,4})?"
        m = re.search(regex, results_div_text)
        results = int(m.group())

        return results

    number_of_results = _parse_number_of_results(results_div.text)
    return number_of_results
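
The docstring above refers to a decorator that re-runs the function whenever it returns False (Example #30 shows the body of such a wrapper). The actual decorator is not included in these examples; the sketch below is a minimal, assumed version of that pattern, where the name retry_on_false and the max_tries default are placeholders.

import functools
import logging

logger = logging.getLogger(__name__)


def retry_on_false(max_tries=5):
    # Hypothetical retry decorator: re-run fn until it returns something
    # other than False, or until max_tries attempts have been made.
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for i in range(max_tries):
                logger.info(f'Trying {fn.__name__} {i}')
                output = fn(*args, **kwargs)
                if output is not False:
                    return output
            return False
        return wrapper
    return decorator


# Hypothetical usage:
# @retry_on_false(max_tries=5)
# def get_google_search_result_count(...):
#     ...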
Example #5
    def __init__(self, name, is_train, norm='instance', activation=tf.nn.leaky_relu):
        logger.info('Init Discriminator %s', name)

        self.name = name
        self.is_train = is_train
        self.norm = norm
        self.activation = activation
        self.reuse = False
Example #6
 def SvcStop(self):
     logger.info("[WIN SERVICE]stop service ddns....")
     # first tell the SCM that the service is stopping
     self.ReportServiceStatus(win32service.SERVICE_STOP_PENDING)
     # signal the stop event
     win32event.SetEvent(self.hWaitStop)
     self.isAlive = False
     self._ddns_Loader.stop()
Example #7
 def new(self, **kw):
     # kw = {'total': int, ..., 'history': tuple}
     last_error = kw['history'][-1].error
     if last_error:
         error_name = last_error.__class__.__name__
         # try to get the url of the error, else just include the whole error
         url = getattr(getattr(last_error, 'pool', None), 'host',
                       None) or last_error
         logger.info(f'Request session: {url} {error_name}')
     return super(RetryWithCallback, self).new(**kw)
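
For context, a Retry subclass like the one above is typically mounted on a requests session through an HTTPAdapter. The sketch below shows one assumed way to wire RetryWithCallback up; the retry count, backoff factor and status codes are illustrative values, not the project's configuration.

import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
# RetryWithCallback is the urllib3 Retry subclass defined above
retries = RetryWithCallback(total=3, backoff_factor=0.5,
                            status_forcelist=(500, 502, 503, 504))
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)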
Example #8
File: db.py Project: lucays/ProxyPool
async def proxies_test(app):
    while True:
        await asyncio.sleep(0.1)
        redis_client = app['redis_client']
        proxies = await redis_client.get_all()
        logger.info(f"proxies count: {len(proxies)}")
        proxytester = ProxyTester()
        sucess_count = await proxytester.all_proxies_test(app, proxies)
        logger.info(f"proxies valid count: {sucess_count}")
        await asyncio.sleep(3600 * 0.2)
Example #9
def evaluate(sess, image_paths, embeddings, labels_batch,
             image_paths_placeholder, labels_placeholder,
             batch_size_placeholder, learning_rate_placeholder,
             phase_train_placeholder, enqueue_op, actual_issame, batch_size,
             nrof_folds, log_dir, step, summary_writer, embedding_size):
    start_time = time.time()
    # Run forward pass to calculate embeddings
    logger.debug('Running forward pass on LFW images: ')

    nrof_images = len(actual_issame) * 2
    assert (len(image_paths) == nrof_images)
    labels_array = np.reshape(np.arange(nrof_images), (-1, 3))
    image_paths_array = np.reshape(np.expand_dims(np.array(image_paths), 1),
                                   (-1, 3))
    sess.run(
        enqueue_op, {
            image_paths_placeholder: image_paths_array,
            labels_placeholder: labels_array
        })
    emb_array = np.zeros((nrof_images, embedding_size))
    nrof_batches = int(np.ceil(nrof_images / batch_size))
    label_check_array = np.zeros((nrof_images, ))
    for i in range(nrof_batches):
        batch_size = min(nrof_images - i * batch_size, batch_size)
        emb, lab = sess.run(
            [embeddings, labels_batch],
            feed_dict={
                batch_size_placeholder: batch_size,
                learning_rate_placeholder: 0.0,
                phase_train_placeholder: False
            })
        emb_array[lab, :] = emb
        label_check_array[lab] = 1
    logger.debug('evaluate time: %.3f' % (time.time() - start_time))

    assert (np.all(label_check_array == 1))

    _, _, accuracy, val, val_std, far = lfw.evaluate(emb_array,
                                                     actual_issame,
                                                     nrof_folds=nrof_folds)

    logger.info('Accuracy: %1.3f+-%1.3f' %
                (np.mean(accuracy), np.std(accuracy)))
    logger.info('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' %
                (val, val_std, far))
    lfw_time = time.time() - start_time
    # Add validation loss and accuracy to summary
    summary = tf.Summary()
    #pylint: disable=maybe-no-member
    summary.value.add(tag='lfw/accuracy', simple_value=np.mean(accuracy))
    summary.value.add(tag='lfw/val_rate', simple_value=val)
    summary.value.add(tag='time/lfw', simple_value=lfw_time)
    summary_writer.add_summary(summary, step)
    with open(os.path.join(log_dir, 'lfw_result.txt'), 'at') as f:
        f.write('%d\t%.5f\t%.5f\n' % (step, np.mean(accuracy), val))
Example #10
File: main.py Project: lastcc/OAHelper
    def SendMail(self, evt):
        self.Render()
        replycontent = self.textctrl.GetValue()
        try:
            self.director.ReplyMail(self.Mail.MAIL_ID, replycontent,
                                    self.Mail.MAIL_SENDER, True)
            self.dq.put_nowait(self.director)
        except Exception:
            logger.info('There is no mail to reply!')

        self.Jump()
Example #11
 def __init__(self,
              image_size,
              block_size,
              norm,
              is_train,
              name='Generator',
              activation=tf.nn.relu):
     self.name = name
     logger.info('Init Generator %s', name)
     self.norm = norm
     self.activation = activation
     self.image_size = image_size
     self.block_size = block_size
     self.is_train = is_train
     self.reuse = False
Example #12
def get_twitter_users(input):

    twitter = Twitter(auth=OAuth(
        TWITTER_KEYS['access_key'], TWITTER_KEYS['access_secret'],
        TWITTER_KEYS['consumer_key'], TWITTER_KEYS['consumer_secret']))

    logger.info('Sending a request to twitter')
    results = twitter.users.search(q=input, count=20)

    output = {'num_users': len(results), 'users': []}
    for user in results[:5]:
        user_data = {
            'username': user['screen_name'],
            'followers_count': user['followers_count'],
            'following_count': user['friends_count'],
            'favourites_count': user['favourites_count']
        }
        output['users'].append(user_data)

    return output
Example #13
File: db.py Project: lucays/ProxyPool
async def crawl_proxies(app):
    proxies_count = await app['redis_client'].get_count()
    logger.info(f"now there are {proxies_count} proxies.")
    while True:
        await asyncio.sleep(3)
        with cd("../"):
            logger.info(f'now in {os.getcwd()}')
            """
            # 这两行代码在Python3.6尚未实现,需要Python3.7才能运行
            process = await asyncio.create_subprocess_exec('scrapy list', stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
            stdout, stderr = await process.communicate()
            """
            p = subprocess.run(['scrapy', 'list'],
                               stdout=subprocess.PIPE,
                               encoding='utf8')
            spiders = p.stdout.split()

            for spider in spiders:
                logger.info(spider)
                try:
                    subprocess.run(['scrapy', 'crawl', spider],
                                   timeout=60 * 10)
                except subprocess.TimeoutExpired as e:
                    logger.error(f'{spider} crawl TIMEOUT!')
                await asyncio.sleep(3)
        proxies_count = await app['redis_client'].get_count()
        await asyncio.sleep(3600 * 0.5)
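
The commented-out block above hints at the asyncio-native subprocess API that it says requires Python 3.7. A hedged sketch of that variant follows; note that asyncio.create_subprocess_exec takes the program and each argument separately rather than a single 'scrapy list' string.

import asyncio


async def list_spiders():
    # asyncio equivalent of subprocess.run(['scrapy', 'list'], stdout=PIPE)
    process = await asyncio.create_subprocess_exec(
        'scrapy', 'list',
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE)
    stdout, _ = await process.communicate()
    return stdout.decode('utf8').split()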
Example #14
def get_wiki_search(query, exact_match=True):

    # Wikipedia has a separate site per language,
    # e.g. the English wiki: https://en.wikipedia.org/w/api.php
    # The endpoint used below is meant to be international
    # and searches the whole wiki instead of the wiki of a specific country.
    # Also: quotes around words mark an "exact phrase" search.
    # For parameters they are also needed to delimit multi-word input.
    # Source: https://www.mediawiki.org/wiki/Help:CirrusSearch#Prefer_phrase_matches
    url = 'https://www.wikidata.org/w/api.php?'
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'search',
        'srsearch': '"' + query + '"' if exact_match else query
    }
    logger.info('Sending a request to wikipedia')
    wiki_json = requests.get(url, params=params).json()

    output = {'items': wiki_json['query']['searchinfo']['totalhits'],
              'wordcount': sum([i['wordcount'] for i in wiki_json['query']['search']])}

    return output
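
A hypothetical usage example: with exact_match=True the query is wrapped in quotes, which CirrusSearch treats as an exact-phrase search, so the two calls below can return very different hit counts. The query string is just an illustration.

exact = get_wiki_search('Ada Lovelace', exact_match=True)
loose = get_wiki_search('Ada Lovelace', exact_match=False)
logger.info(f"exact: {exact['items']} hits, loose: {loose['items']} hits")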
Example #15
File: main.py Project: lastcc/OAHelper
    def run(self):

        err_recs = []
        chain = self.clean_dump(False)

        for func_name, args, kw in chain:
            func = getattr(self, func_name)
            result = func(*args, **kw)()

            try:
                print(func_name)
                print(args, kw)
            except Exception:
                pass

            if result not in [True, '[SUCCESS]', 1]:
                rec = (func_name, args, kw)  # the failing chain record
                print(rec)
                err_recs.append(rec)

        if len(err_recs) == 0:
            logger.info('~No Error')
        else:
            logger.warning(err_recs)
            logger.warning(self.dump())
Example #16
def OA_NEED_CARE_FIND_BUYER(opener, store):
    '''This is buggy'''
    store = '842'

    url = 'http://banggood.sellercube.com/eBayCaseAttempt/Grid'
    form = {
        'ItemID': '',
        'rp': '300',
        'sortname': 'ImportTime',
        'Site': '',
        'DispatchUserID': '0',
        'sortorder': 'desc',
        'eBayName': store,
        'IsRespone': '0',
        'BuyerUserID': '',
        'query': '',
        'TurnOverUser': '',
        'qtype': '',
        'page': '1'
    }

    response = Firefox(opener).post(url, form)

    #id
    #item
    #buyer

    RE = re.compile(
        r'"id":"(\w*?)","cell":\["homesale_estore","(\d*?)",".*?",".*?",".*?","(.*?)"'
    )

    RLT = RE.findall(response)

    logger.info(str(RLT))

    return RLT
Example #17
def save_variables_and_metagraph(sess, saver, summary_writer, model_dir,
                                 model_name, step):
    # Save the model checkpoint
    logger.info('Saving variables')
    start_time = time.time()
    checkpoint_path = os.path.join(model_dir, 'model-%s.ckpt' % model_name)
    saver.save(sess, checkpoint_path, global_step=step, write_meta_graph=False)
    save_time_variables = time.time() - start_time
    logger.info('Variables saved in %.2f seconds' % save_time_variables)
    metagraph_filename = os.path.join(model_dir, 'model-%s.meta' % model_name)
    save_time_metagraph = 0
    if not os.path.exists(metagraph_filename):
        logger.info('Saving metagraph')
        start_time = time.time()
        saver.export_meta_graph(metagraph_filename)
        save_time_metagraph = time.time() - start_time
        logger.info('Metagraph saved in %.2f seconds' % save_time_metagraph)
    summary = tf.Summary()
    #pylint: disable=maybe-no-member
    summary.value.add(tag='time/save_variables',
                      simple_value=save_time_variables)
    summary.value.add(tag='time/save_metagraph',
                      simple_value=save_time_metagraph)
    summary_writer.add_summary(summary, step)
Example #18
File: db.py Project: lucays/ProxyPool
 async def proxy_test(self, app, session, proxy):
     real_proxy = 'http://' + proxy
     try:
         async with session.get(TEST_URL,
                                headers=HEADERS,
                                proxy=real_proxy,
                                timeout=TIMEOUT) as resp:
             if resp.status == 200:
                 logger.info(f'{proxy} test useful')
                 await app['redis_client'].score2max(proxy)
                 self.sucess_count += 1
             else:
                 logger.info(f'{proxy} status code error: {resp.status}')
                 await app['redis_client'].decrease(proxy)
             await asyncio.sleep(0.5)
     except Exception as e:
         logger.info(f'{proxy} test invalid')
         await app['redis_client'].decrease(proxy)
Example #19
    def start(self):
        while self._is_alive:
            try:
                if self._domain_id is None:
                    self.get_domain()
                if len(self._record_dict) == 0:
                    self.get_record()

                ip = self.get_ip()
                logger.info("LOCAL IP ADDR:" + ip)
                if self._current_ip != ip:
                    for sub_domain, record in self._record_dict.items():
                        if ip == record['value']:
                            continue
                        if self.ddns(ip, record['id'], sub_domain):
                            logger.info("[DNSPOD]CHANGE %s DDNS IP [%s --> %s]" % (sub_domain, self._current_ip, ip))
                            self._current_ip = ip
                        else:
                            logger.info("[DNSPOD]REFRESH DDNS FAIL")
            except Exception as e:
                logger.error(e)
            time.sleep(30)
Example #20
    async def fetch(session, url, proxy):

        # setting a timeout for bad proxies
        timeout = aiohttp.ClientTimeout(total=5)

        # setting 5 retries in case of proxy fail
        # sometimes the url (due to proxies) is unreachable
        for i in range(5):
            logger.info(f'Gathering single async request: {i}')
            try:
                async with session.get(url,
                                       headers=headers,
                                       proxy=proxy,
                                       timeout=timeout) as response:
                    logger.info(
                        f'Gathering single async request: successfully received a response'
                    )
                    return await response.text()
            except Exception as e:
                # TODO: switching this to logger.exception breaks everything!?
                # the for loop stops just before the line below instead of continuing
                logger.info(f'Single async requests exception: {repr(e)}')
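
For context, fetch() above is driven by fetch_all() from Example #2. The snippet below is a minimal sketch of how the two might be run together; the urls, headers and proxy values are placeholders rather than project configuration.

import asyncio

urls = ['https://example.com/a', 'https://example.com/b']  # placeholder urls
headers = {'User-Agent': 'Mozilla/5.0'}                     # placeholder headers
proxy = None                                                # or e.g. 'http://host:port'

loop = asyncio.get_event_loop()
results = loop.run_until_complete(fetch_all(urls, loop))
for result in results:
    # gather() was called with return_exceptions=True, so failures come back as exception objects
    if isinstance(result, Exception):
        logger.info(f'request failed: {repr(result)}')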
Example #21
    def start(self):
        while self._is_alive:
            try:
                if self._domain_id is None:
                    self.get_domain()
                if len(self._record_dict) == 0:
                    self.get_record()

                ip = self.get_ip()
                logger.info("LOCAL IP ADDR:" + ip)
                if self._current_ip != ip:
                    for sub_domain, record in self._record_dict.items():
                        if ip == record['value']:
                            continue
                        if self.ddns(ip, record['id'], sub_domain):
                            logger.info(
                                "[DNSPOD]CHANGE %s DDNS IP [%s --> %s]" %
                                (sub_domain, self._current_ip, ip))
                            self._current_ip = ip
                        else:
                            logger.info("[DNSPOD]REFRESH DDNS FAIL")
            except Exception as e:
                logger.error(e)
            time.sleep(30)
Example #22
def main(setType, nn, a=None, b=None):
    if (setType == 'train'):
        folders = glob.glob('crowdai_train/crowdai/*')  # Get folder names
        parts = [Path(folder).parts[2] for folder in folders]
        classes = [
            re.search(r'c_(.*)', part, re.I | re.M).group(1) for part in parts
        ]  #Get the classes

        train_data = dict()
        train_data['image'] = []
        train_data['label'] = []
        train_data['location'] = []

        logger.info('Creating train arrays for {} ...'.format(nn))

        if (nn == 'alexnet'):
            dim = 227
        elif (nn == 'vgg16'):
            dim = 224

        j = 0  # Classes index
        while (j < len(classes)):
            lbl = classes[j]
            logger.info('Opening folder: {} ...'.format(lbl))
            folder = folders[j]

            if (a is None) and (b is None):
                imgs = glob.glob(folder + '/*.jpg')
            else:
                imgs = glob.glob(folder + '/*.jpg')[a:b]  #b - a pics

            for img in imgs:
                logger.info('Creating data point {} ...'.format(img))

                rd = cv2.imread(img)
                arr = cv2.resize(rd, (dim, dim))

                arr = arr.astype('uint64')

                train_data['image'].append(arr)
                train_data['label'].append(lbl)
                train_data['location'].append(folder)

                logger.info('Finished creating data point {}.'.format(img))

            j = j + 1

        logger.info('Finished creating arrays. Writing to DF...')
        np.set_printoptions(threshold=np.inf)
        df = pd.DataFrame.from_dict(train_data, orient='columns')
        logger.info('Finished writing to DF.')

        return df

    # Deal with test data
    elif (setType == 'test'):

        test_data = dict()
        test_data['image'] = []

        imgs = glob.glob('crowdai_test/crowdai/*.jpg')

        logger.info('Creating test arrays...')

        if (nn == 'alexnet'):
            dim = 227
        elif (nn == 'vgg16'):
            dim = 224

        for img in imgs:
            logger.info('Creating data point {} ...'.format(img))

            rd = cv2.imread(img)
            arr = cv2.resize(rd, (dim, dim))

            arr = arr.astype('uint64')

            test_data['image'].append(arr)
            logger.info('Finished creating data point {} ...'.format(img))

        logger.info('Finished creating arrays. Writing to DF...')
        np.set_printoptions(threshold=np.inf)
        df = pd.DataFrame.from_dict(test_data, orient='columns')
        logger.info('Finished writing to DF.')

        return df
Example #23
def run(args):
    logger.info('Read data:')
    # get_data: loads the data with h5py; this needs to be replaced with TensorFlow-based loading
    train_A, train_B, test_A, test_B = get_data(args.task, args.image_size)

    logger.info('Build graph:')
    model = CycleGAN(args)

    # collect the variables that need to be saved
    variables_to_save = tf.global_variables()
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    # is var_list no longer used?
    saver = FastSaver(variables_to_save)

    logger.info('Trainable vars:')
    # ..... get the list of trainable variables and write it to the log
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())
    # this part is fairly important!
    if args.load_model != '':
        model_name = args.load_model
    else:
        model_name = '{}_{}'.format(
            args.task,
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    logdir = './logs'
    makedirs(logdir)
    logdir = os.path.join(logdir, model_name)
    logger.info('Events directory: %s', logdir)
    summary_writer = tf.summary.FileWriter(logdir)

    def init_fn(sess):
        logger.info('Initializing all parameters.')
        sess.run(init_all_op)

    sv = tf.train.Supervisor(
        is_chief=True,
        logdir=logdir,
        saver=saver,
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=model.global_step,
        save_model_secs=300,
        save_summaries_secs=30)

    if args.train:
        logger.info("Starting training session.")
        with sv.managed_session() as sess:
            model.train(sess, summary_writer, train_A, train_B)

    logger.info("Starting testing session.")
    with sv.managed_session() as sess:
        base_dir = os.path.join('results', model_name)
        makedirs(base_dir)
        model.test(sess, test_A, test_B, base_dir)
Example #24
 def init_fn(sess):
     logger.info('Initializing all parameters.')
     sess.run(init_all_op)
Example #25
	"""
	# Training in batches because of the dataset size
	# p = 0 
	# q = 20
	# iteration = 0
	# top = 125 # The total pics in the class with fewest images + 1

	theModel = sys.argv[1]

	df = preprocessing.main('train', theModel, None, None) #Read all data

	X = np.array(df['image'].tolist()) # Generate array of arrays for X, and array of vectors for y
	df['vector_labels'] = pd.get_dummies(df['label']).values.tolist()
	Y = np.array(df['vector_labels'].tolist())

	logger.info('X.shape: {}'.format(X.shape))
	logger.info('Y.shape: {}'.format(Y.shape))
	logger.info('Starting training...')

	train(X, Y, iteration, theModel)

	# while (p < top):
	# 	logger.info('Reading DF...')
	# 	df = preprocessing.main('train', theModel, p, q) #Let's do (q - p) pics from all classes per iter

	# 	X = np.array(df['image'].tolist()) # Generate array of arrays for X, and array of vectors for y
	# 	df['vector_labels'] = pd.get_dummies(df['label']).values.tolist()
	# 	Y = np.array(df['vector_labels'].tolist())

	# 	logger.info('X.shape: {}'.format(X.shape))
	# 	logger.info('Y.shape: {}'.format(Y.shape))
Example #26
if __name__ == '__main__':
	"""
	Due to RAM constraints, I'm reading the images in batches of 10 i.e. 10 images per class per iteration.

	If you have access to a GPU or massive RAM, you can:
		1. Set p = None and q = None
		2. Set p = 0 and q = (n + 1) where the n is the number of pictures to read per iteration
			Make sure to update p and q by replacing 10 with n
	"""
	# Training in batches because of the dataset size
	p = 0 
	q = 21
	iteration = 0
	top = 125 # The total pics in the class with fewest images + 1

	logger.info('Reading DF...')
	theModel = sys.argv[1]

	if (p is None) or (q is None):
		df = preprocessing.main('train', theModel, None, None) #Read all data

		X = np.array(df['image'].tolist()) # Generate array of arrays for X, and array of vectors for y
		df['vector_labels'] = pd.get_dummies(df['label']).values.tolist()
		Y = np.array(df['vector_labels'].tolist())

		logger.info('X.shape: {}'.format(X.shape))
		logger.info('Y.shape: {}'.format(Y.shape))
		logger.info('Starting training...')

		train(X,  Y, iteration, theModel)
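	# NOTE (added sketch, not the original code): the snippet is truncated here.
	# A hedged sketch of the batched branch described in the docstring, assuming
	# preprocessing.main and train behave as above; n is the number of images
	# per class per iteration.
	else:
		n = q - p
		while p < top:
			logger.info('Reading DF...')
			df = preprocessing.main('train', theModel, p, q)  # (q - p) pics per class

			X = np.array(df['image'].tolist())
			df['vector_labels'] = pd.get_dummies(df['label']).values.tolist()
			Y = np.array(df['vector_labels'].tolist())

			train(X, Y, iteration, theModel)

			p += n
			q += n
			iteration += 1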
	
Example #27
def train(X, Y, iteration, whichOne):
	XTrain, XTest, yTrain, yTest = train_test_split(X, Y, test_size=0.33, random_state=42)

	if(whichOne == 'alexnet'):
		if(os.path.exists('saved_models/agrix_alexnet.h5')):
			logger.info('Reading alexnet from file. Time to improve!!')
			model = load_model('saved_models/agrix_alexnet.h5')
		else:
			logger.info('Instantiating model for the first time')
			model = model_defs.alexnet()
			
	elif(whichOne == 'vgg16'):
		if(os.path.exists('saved_models/agrix_vgg16.h5')):
			logger.info('Reading vgg16 from file. Time to improve!!')
			model = load_model('saved_models/agrix_vgg16.h5')
		else:
			logger.info('Instantiating model for the first time')
			model = model_defs.vgg16()


	# Run model
	model.fit(XTrain, yTrain, batch_size = 64, epochs = 100, validation_split = 0.33, shuffle = True, verbose = 1)
	scores = model.evaluate(XTest, yTest, batch_size = 64, verbose = 1)
	logger.info("VALIDATION SCORE: {}: {}%".format(model.metrics_names[1], scores[1] * 100))

	if(whichOne == 'alexnet'):
		model.save('saved_models/agrix_alexnet.h5')
		logger.info('Saved alexnet to file.')
	elif(whichOne == 'vgg16'):
		model.save('saved_models/agrix_vgg16.h5')
		logger.info('Saved vgg16 to file.')
Example #28
def get_google_search_response(person_name, exact_match, proxies,
                               country_code):

    # set params
    params = {'as_epq' if exact_match else 'q': person_name.encode('utf8')}

    # make sure to set google search country code because
    # when using proxies the google results will depend on
    # the random country the proxy is located at and
    # the results will differ with every random proxy call

    # also keep in mind that including this parameter (as well as others, likely)
    # will increase the prob of triggering bot detection
    # so using this without a proxy will quickly result
    # in google banning the ip address and asking for recaptcha

    # set lr and cr params, maybe both of them will result
    # in actually simulating a search from the specified country
    # source: https://github.com/MarioVilas/googlesearch/blob/master/googlesearch/__init__.py

    # as of 2021 the request triggers some kind of google protection
    # by not rendering the full page in html. Depending on the exact params set
    # in the html returned it says "Jei per kelias sekundes nebūsite nukreipti, <...>"
    # or just returns a different html structure that does not include div slim_appbar.
    # div slim_appbar contains the number of total results and we want to parse it.
    #
    # For future reference, the following does not help:
    # - following the url provided together with "if you are not redirected within a few";
    # - fixing the proxy location (in private.py), e.g. US only;
    # - not setting the cr / lr params.
    if proxies:
        pass
        # params['cr'] = 'us'
        # params['lr'] = 'lang_' + 'us'
    if country_code:
        pass
        # params['cr'] = country_code
        # params['lr'] = 'lang_' + country_code

    # set headers - this is important!!!
    # if headers are not set google does not return
    # the number of search results and none of the divs
    # responsible for storing number of results are there.
    # Basically, the structure of html is totally different.
    # IMPORTANT: the above holds for requests sent directly
    # IMPORTANT: and for requests sent through a proxy.

    # UserAgent() relies on a heroku app that sometimes fails.
    # Let's save a list of browsers for headers locally,
    # so that we don't have to call the heroku app
    # again and again every time.
    # However, randomising from locally stored browsers does
    # increase the rate of captchas and invalid responses,
    # so for now let's fall back to UserAgent and investigate
    # the reasons later on.
    headers = {'User-Agent': UserAgent().random}
    # headers = {'User-Agent': random.choice(browsers)}

    # make the request
    url = 'https://www.google.com/search'
    response = requests_retry_session().get(url,
                                            params=params,
                                            headers=headers,
                                            proxies=proxies)

    # if recaptcha is in the response, the client that sent the request
    # is blacklisted, so let's return False
    if 'https://www.google.com/recaptcha/api.js' in response.text:
        logger.info('Received Captcha request (google search)')
        return False

    logger.info('Received a valid response (google search)')
    return response
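
requests_retry_session() is referenced above (and in Example #4) but is not shown in these examples. The helper below is a commonly seen sketch of what such a function looks like, built on urllib3's Retry and an HTTPAdapter; the retry counts, backoff factor and status codes are assumed defaults, not the project's actual values.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def requests_retry_session(retries=3, backoff_factor=0.5,
                           status_forcelist=(500, 502, 503, 504),
                           session=None):
    # Build (or reuse) a session whose adapters retry failed requests
    session = session or requests.Session()
    retry = Retry(total=retries,
                  connect=retries,
                  read=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session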
Example #29
def main(args):

    network = importlib.import_module(args.model_def)

    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(
            log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)

    if not os.path.isdir(
            model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    model_dir_plus = os.path.join(
        os.path.expanduser(args.models_plus_base_dir), subdir)
    if not os.path.isdir(
            model_dir_plus):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir_plus)

    # Write arguments to a text file
    facenet.write_arguments_to_file(args, os.path.join(log_dir,
                                                       'arguments.txt'))

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)
    train_set_ID = facenet.get_dataset(args.data_dir_ID)
    train_set_camera = facenet.get_dataset(args.data_dir_camera)

    logger.info('Model directory: %s' % model_dir)
    logger.info('Log directory: %s' % log_dir)
    if args.pretrained_model:
        logger.info('Pre-trained model: %s' %
                    os.path.expanduser(args.pretrained_model))

    if args.lfw_dir:
        logger.info('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(
            os.path.expanduser(args.lfw_dir), pairs, args.lfw_file_ext)

    # associative, fengchen
    assoc = Associative(network, args)

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False)

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32,
                                                   name='learning_rate')
        batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')

        image_paths_placeholder_ID = tf.placeholder(tf.string,
                                                    shape=(None, 3),
                                                    name='image_paths_ID')
        image_paths_placeholder_camera = tf.placeholder(
            tf.string, shape=(None, 3), name='image_paths_camera')
        image_paths_placeholder_valid = tf.placeholder(
            tf.string, shape=(None, 3), name='image_paths_valid')
        labels_placeholder_ID = tf.placeholder(tf.int64,
                                               shape=(None, 3),
                                               name='labels_ID')
        labels_placeholder_camera = tf.placeholder(tf.int64,
                                                   shape=(None, 3),
                                                   name='labels_camera')
        labels_placeholder_valid = tf.placeholder(tf.int64,
                                                  shape=(None, 3),
                                                  name='labels_valid')
        input_queue_ID = data_flow_ops.FIFOQueue(capacity=100000,
                                                 dtypes=[tf.string, tf.int64],
                                                 shapes=[(3, ), (3, )],
                                                 shared_name=None,
                                                 name=None)
        input_queue_camera = data_flow_ops.FIFOQueue(
            capacity=100000,
            dtypes=[tf.string, tf.int64],
            shapes=[(3, ), (3, )],
            shared_name=None,
            name=None)
        input_queue_valid = data_flow_ops.FIFOQueue(
            capacity=100000,
            dtypes=[tf.string, tf.int64],
            shapes=[(3, ), (3, )],
            shared_name=None,
            name=None)
        enqueue_op_ID = input_queue_ID.enqueue_many(
            [image_paths_placeholder_ID, labels_placeholder_ID])
        enqueue_op_camera = input_queue_camera.enqueue_many(
            [image_paths_placeholder_camera, labels_placeholder_camera])
        enqueue_op_valid = input_queue_valid.enqueue_many(
            [image_paths_placeholder_valid, labels_placeholder_valid])
        nrof_preprocess_threads = 4

        images_and_labels_ID = []
        images_and_labels_camera = []
        images_and_labels_valid = []
        for _ in range(nrof_preprocess_threads):
            filenames, label = input_queue_ID.dequeue()
            images = []
            for filename in tf.unstack(filenames):
                file_contents = tf.read_file(filename)
                image = tf.image.decode_image(file_contents, channels=3)

                if args.random_crop:
                    image = tf.random_crop(
                        image, [args.image_size, args.image_size, 3])
                else:
                    image = tf.image.resize_image_with_crop_or_pad(
                        image, args.image_size, args.image_size)
                if args.random_flip:
                    image = tf.image.random_flip_left_right(image)

                #pylint: disable=no-member
                image.set_shape((args.image_size, args.image_size, 3))
                images.append(tf.image.per_image_standardization(image))
            images_and_labels_ID.append([images, label])

        for _ in range(nrof_preprocess_threads):
            filenames, label = input_queue_camera.dequeue()

            images = []
            for filename in tf.unstack(filenames):
                file_contents = tf.read_file(filename)
                image = tf.image.decode_image(file_contents, channels=3)

                if args.random_crop:
                    image = tf.random_crop(
                        image, [args.image_size, args.image_size, 3])
                else:
                    image = tf.image.resize_image_with_crop_or_pad(
                        image, args.image_size, args.image_size)
                if args.random_flip:
                    image = tf.image.random_flip_left_right(image)

                # pylint: disable=no-member
                image.set_shape((args.image_size, args.image_size, 3))
                images.append(tf.image.per_image_standardization(image))
            images_and_labels_camera.append([images, label])

        for _ in range(nrof_preprocess_threads):
            filenames, label = input_queue_valid.dequeue()

            images = []
            for filename in tf.unstack(filenames):
                file_contents = tf.read_file(filename)
                image = tf.image.decode_image(file_contents, channels=3)

                if args.random_crop:
                    image = tf.random_crop(
                        image, [args.image_size, args.image_size, 3])
                else:
                    image = tf.image.resize_image_with_crop_or_pad(
                        image, args.image_size, args.image_size)
                if args.random_flip:
                    image = tf.image.random_flip_left_right(image)

                # pylint: disable=no-member
                image.set_shape((args.image_size, args.image_size, 3))
                images.append(tf.image.per_image_standardization(image))
            images_and_labels_valid.append([images, label])

        image_batch_ID, labels_batch_ID = tf.train.batch_join(
            images_and_labels_ID,
            batch_size=batch_size_placeholder,
            shapes=[(args.image_size, args.image_size, 3), ()],
            enqueue_many=True,
            capacity=4 * nrof_preprocess_threads * args.batch_size,
            allow_smaller_final_batch=True)
        image_batch_ID = tf.identity(image_batch_ID, 'image_batch_ID')
        image_batch_ID = tf.identity(image_batch_ID, 'input_ID')
        labels_batch_ID = tf.identity(labels_batch_ID, 'label_batch_ID')

        image_batch_camera, labels_batch_camera = tf.train.batch_join(
            images_and_labels_camera,
            batch_size=batch_size_placeholder,
            shapes=[(args.image_size, args.image_size, 3), ()],
            enqueue_many=True,
            capacity=4 * nrof_preprocess_threads * args.batch_size,
            allow_smaller_final_batch=True)
        image_batch_camera = tf.identity(image_batch_camera,
                                         'image_batch_camera')
        image_batch_camera = tf.identity(image_batch_camera, 'input_camera')
        labels_batch_camera = tf.identity(labels_batch_camera,
                                          'label_batch_camera')

        image_batch_valid, labels_batch_valid = tf.train.batch_join(
            images_and_labels_valid,
            batch_size=batch_size_placeholder,
            shapes=[(args.image_size, args.image_size, 3), ()],
            enqueue_many=True,
            capacity=4 * nrof_preprocess_threads * args.batch_size,
            allow_smaller_final_batch=True)

        image_batch_valid = tf.identity(image_batch_valid, 'image_batch_valid')
        labels_batch_valid = tf.identity(labels_batch_valid,
                                         'label_batch_valid')

        # Build the inference graph
        prelogits_ID, _, _, _, _ = network.inference(
            image_batch_ID,
            args.keep_probability,
            phase_train=phase_train_placeholder,
            bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)
        prelogits_camera, _, _, _, _ = network.inference(
            image_batch_camera,
            args.keep_probability,
            phase_train=phase_train_placeholder,
            bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)
        prelogits_valid, _, _, _, _ = network.inference(
            image_batch_valid,
            args.keep_probability,
            phase_train=phase_train_placeholder,
            bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)

        embeddings_ID = tf.nn.l2_normalize(prelogits_ID,
                                           1,
                                           1e-10,
                                           name='embeddings_ID')
        embeddings_camera = tf.nn.l2_normalize(prelogits_camera,
                                               1,
                                               1e-10,
                                               name='embeddings_camera')
        embeddings_valid = tf.nn.l2_normalize(prelogits_valid,
                                              1,
                                              1e-10,
                                              name='embeddings_valid')

        # Split embeddings into anchor, positive and negative and calculate triplet loss
        anchor_ID, positive_ID, negative_ID = tf.unstack(
            tf.reshape(embeddings_ID, [-1, 3, args.embedding_size]), 3, 1)
        triplet_loss_ID = facenet.triplet_loss(anchor_ID, positive_ID,
                                               negative_ID, args.alpha)

        anchor_camera, positive_camera, negative_camera = tf.unstack(
            tf.reshape(embeddings_camera, [-1, 3, args.embedding_size]), 3, 1)
        triplet_loss_camera = facenet.triplet_loss(anchor_camera,
                                                   positive_camera,
                                                   negative_camera, args.alpha)

        images_mmd_ID, images_mmd_camera, _, _ = assoc.get_image_and_label_dann(
        )
        feature_map3_ID, _, _, _, _ = network.inference(
            images_mmd_ID,
            args.keep_probability,
            phase_train=phase_train_placeholder,
            bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)
        feature_map3_camera, _, _, _, _ = network.inference(
            images_mmd_camera,
            args.keep_probability,
            phase_train=phase_train_placeholder,
            bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        # feature_map3_ID = tf.nn.l2_normalize(feature_map3_ID, 1, 1e-10, name='feature_map3_ID')
        # feature_map3_camera = tf.nn.l2_normalize(feature_map3_camera, 1, 1e-10, name='feature_map3_camera')
        dann_loss = 0.1 * losses.dann_loss(feature_map3_ID,
                                           feature_map3_camera, 1)
        # logger.debug("feature_map3_ID: %s, feature_map3_camera: %s" % (feature_map3_ID.get_shape(), feature_map3_camera.get_shape()))

        learning_rate = tf.train.exponential_decay(
            learning_rate_placeholder,
            global_step,
            args.learning_rate_decay_epochs * args.epoch_size,
            args.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        # # associative, fengchen
        # associative_loss = assoc.loss() * 10

        # Calculate the total losses
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)

        triplet_loss = tf.add_n([triplet_loss_ID] + [triplet_loss_camera] +
                                regularization_losses,
                                name='triplet_loss')

        # associative, fengchen
        loss_total = tf.add_n([triplet_loss_ID] + [triplet_loss_camera] +
                              [dann_loss] + regularization_losses,
                              name='loss_total')

        saver_plus = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)

        # Build a Graph that trains the model with one batch of examples and updates the model parameters
        train_op = facenet.train(loss_total, global_step, args.optimizer,
                                 learning_rate, args.moving_average_decay,
                                 tf.global_variables())
        train_op_triplet = facenet.train(triplet_loss, global_step,
                                         args.optimizer, learning_rate,
                                         args.moving_average_decay,
                                         tf.global_variables())

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

        # Initialize variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})
        sess.run(tf.local_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():

            if args.pretrained_model:
                logger.info('Restoring pretrained model: %s' %
                            args.pretrained_model)
                # saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3)
                saver.restore(sess, os.path.expanduser(args.pretrained_model))
            # saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                # Train for one epoch
                train(args, sess, train_set_ID, train_set_camera, epoch,
                      image_paths_placeholder_ID,
                      image_paths_placeholder_camera, labels_placeholder_ID,
                      labels_placeholder_camera, labels_batch_ID,
                      labels_batch_camera, batch_size_placeholder,
                      learning_rate_placeholder, phase_train_placeholder,
                      enqueue_op_ID, enqueue_op_camera, global_step,
                      embeddings_ID, embeddings_camera, triplet_loss,
                      loss_total, triplet_loss_ID, triplet_loss_camera,
                      dann_loss, regularization_losses, train_op,
                      train_op_triplet, summary_writer,
                      args.learning_rate_schedule_file, args.embedding_size)

                # Save variables and the metagraph if it doesn't exist already
                save_variables_and_metagraph(sess, saver, summary_writer,
                                             model_dir, subdir, step)
                save_variables_and_metagraph(sess, saver_plus, summary_writer,
                                             model_dir_plus, subdir, step)

                # Evaluate on LFW
                if args.lfw_dir:
                    evaluate(sess, lfw_paths, embeddings_valid,
                             labels_batch_valid, image_paths_placeholder_valid,
                             labels_placeholder_valid, batch_size_placeholder,
                             learning_rate_placeholder,
                             phase_train_placeholder, enqueue_op_valid,
                             actual_issame, args.batch_size,
                             args.lfw_nrof_folds, log_dir, step,
                             summary_writer, args.embedding_size)

    return model_dir
Example #30
 def wrapper(*args, **kwargs):
     for i in range(max_tries):
         logger.info(f'Trying {fn.__name__} {i}')
         output = fn(*args, **kwargs)
         if output:
             return output
Example #31
def google_translate(google_data, proxies):
    # https://github.com/ssut/py-googletrans
    # TODO: do in one batch by giving an array

    # collect items to translate
    snippets = [item['snippet'] for item in google_data['items']]
    titles = [item['title'] for item in google_data['items']]

    # okay, this is pretty ugly, but here is the idea:
    # I want to combine the whole text into one string so that only one request is sent to google translate.
    # To do this, snippets and titles are combined with the special separator '|||' (with the hope that it will not break)
    # and later everything is reorganized back into snippets and titles.
    # text_to_translate = titles + snippets
    text_to_translate = [
        ' ||| '.join([
            title + ' ||| ' + snippet
            for title, snippet in zip(titles, snippets)
        ])
    ]

    # clear non alpha num
    # e.g ☀ throws error in translator
    # re is fastest
    # https://stackoverflow.com/questions/1276764/stripping-everything-but-alphanumeric-chars-from-a-string-in-python
    # keep . and , and add else if needed
    text_to_translate = [
        re.sub(r"[^\w.,|']", ' ', text) for text in text_to_translate
    ]

    # translate
    # Translator creates its own requests session, so let's
    # modify it to retry on failed connections / other errors;
    # otherwise, the code will fail at this point with a broken connection
    translator = Translator(proxies=proxies, timeout=5)

    # the following would overwrite a couple of things
    # also would force token acquisition to be done via proxy
    # none of this is good, so lets not do this and use the default session
    # translator.session = requests_retry_session(retries=0, timeout=5)

    # temp fix: https://github.com/ssut/py-googletrans/issues/234
    # from the looks of it, it seems the issue will be fixed soon;
    # this also helps in cases where the proxy connection fails
    for _ in range(5):
        try:
            translated = [
                item.text
                for item in translator.translate(text_to_translate, dest='en')
            ]
            logger.info('Successfully google translated text')
            break
        except Exception as e:
            logger.info(f'google translate error {_}')
            translator = Translator(proxies=proxies, timeout=5)

    # ungroup and split back to snippets and titles
    titles_translated = translated[0].split('|||')[0::2]
    snippets_translated = translated[0].split('|||')[1::2]

    # assign
    for item, snippet, title in zip(google_data['items'], snippets_translated,
                                    titles_translated):
        item['snippet'] = snippet
        item['title'] = title

    return google_data
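
To make the packing scheme above concrete, here is a small round-trip demonstration with toy data (not project data): titles and snippets are joined with the '|||' separator, sent as one string, and split back by taking every other element.

titles = ['Title A', 'Title B']
snippets = ['Snippet A', 'Snippet B']

# pack: 'Title A ||| Snippet A ||| Title B ||| Snippet B'
packed = ' ||| '.join(title + ' ||| ' + snippet
                      for title, snippet in zip(titles, snippets))

# unpack: even positions are titles, odd positions are snippets
parts = packed.split('|||')
titles_back = [p.strip() for p in parts[0::2]]    # ['Title A', 'Title B']
snippets_back = [p.strip() for p in parts[1::2]]  # ['Snippet A', 'Snippet B']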