Example #1
class GoogleDataHelper(object):
    def __init__(self):
        self.go = GoogleSearch()

    def _get_config(self):
        pass

    def _to_dataframe(self, result, keep_raw_data=True):
        df = pd.DataFrame()

        created_at = date.today().strftime('%Y-%m-%d')
        df['source'] = ['google' for i in range(len(result))]
        df['created_at'] = [created_at for i in range(len(result))]
        df['author'] = [tup[1] for tup in result]
        df['text'] = [' | '.join([tup[0], tup[2]]) for tup in result]
        df['url'] = [tup[1] for tup in result]

        if keep_raw_data:
            df['raw_data'] = [tup for tup in result]

        # return the assembled frame so get_data() receives it instead of None
        return df

    def get_data(self,
                 querystring='deep learning',
                 num=10,
                 keep_raw_data=True):
        query = self.go.prune(querystring)
        self.go.doquery(query, num)
        result = self.go.showpage(num)

        df = self._to_dataframe(result, keep_raw_data)

        return df
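A minimal usage sketch for Example #1, assuming pandas (as pd), datetime.date, and the project's GoogleSearch class are importable as the snippet implies; the helper name below is just a local variable:

# Sketch only: imports and the GoogleSearch implementation are assumptions based on the class above.
helper = GoogleDataHelper()
df = helper.get_data(querystring='deep learning', num=10, keep_raw_data=False)
print(df[['source', 'created_at', 'author', 'url']].head())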
Example #2
def find_links(docs_num):
    english_words = read_all_english_words()
    # prev_search_time = datetime.now()
    links_set = set()
    search = GoogleSearch()

    while len(links_set) < docs_num:
        print 'len(links_set) is: ', len(links_set)
        num_of_words = random.randint(3, 5)
        words = [
            english_words[random.randrange(10000)] for i in range(num_of_words)
        ]
        # links_list, prev_search_time, next_url = search.new_search(words, prev_search_time)
        if not search.new_search(words, False):
            continue
        for i in range(10):
            link = search.next_link(avoid_more_searches=True)
            if not link:
                break
            if link not in links_set:
                with open(
                        '/Users/uriklarman/Development/PycharmProjects/keywords_learning/links.txt',
                        'a') as f:
                    f.write(link + '\n')
                links_set.add(link)
                if len(links_set) == docs_num:
                    break
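A hedged call sketch for the crawler above; it appends each new link to the hard-coded links.txt path, so the collected links can simply be read back afterwards:

# Sketch only: reuses the absolute output path hard-coded in find_links above.
find_links(docs_num=100)
with open('/Users/uriklarman/Development/PycharmProjects/keywords_learning/links.txt') as f:
    links = [line.strip() for line in f if line.strip()]
print(len(links), 'links collected')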
Example #3
File: main.py Project: apanesarr/HQBot
def main():
    screen.shot()
    question_options = localocr.getText()
    wiki_search = Wiki(question_options['question'],
                       question_options['options'])
    wiki_result = wiki_search.getResults()
    google_search = GoogleSearch()
    google_search_result = google_search.getResult(
        question_options['question'], question_options['options'])
    wiki_index = wiki_result.index(max(wiki_result))
    google_index = google_search_result.index(max(google_search_result))

    print("WIKI FOUND: " + str(question_options['options'][wiki_index]) +
          "\t Frequency of Option: " + str(wiki_result[wiki_index]))
    print("GOOGLE FOUND: " + str(question_options['options'][google_index]) +
          "\t Number of Results: " + str(google_search_result[google_index]))
Example #4
File: query.py Project: vsraptor/pse
def google_search(query):
	s = GoogleSearch()
	s.search(query)
	for info in s.results() :
		print '-' * 50
		print info
		print s.excerpt(info['id'],5,50)
Example #5
File: query.py Project: wuben3125/pse
def google_search(query):
    s = GoogleSearch()
    s.search(query)
    for info in s.results():
        print('-' * 50)
        print(info)
        print(s.excerpt(info['id'], 5, 50))
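Example #5 is the same routine as Example #4, ported from Python 2 print statements to the print() function. A hedged call sketch, treating GoogleSearch.search(), results(), and excerpt() as the project-specific API used above:

# Sketch only: collect excerpts instead of printing them.
s = GoogleSearch()
s.search('python web scraping')
excerpts = [s.excerpt(info['id'], 5, 50) for info in s.results()]
print(len(excerpts), 'excerpts collected')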
Example #6
File: routes.py Project: wuben3125/pse
def search():
	google = GoogleSearch()
	bmark = BmarkSearch()

	q = ''  # default so render_template() below always has a query value
	if 'q' in request.form: # updated from if request.form.has_key('q') :
		q = request.form['q']

		if len(q) > 0 :

			try:
				bmark.search(q)
			except Exception as e :
				flash('Bmark search: ' + str(e)) # updated from flash("Google search error : " + e.message)

			try :
				google.search(q)
			except Exception as e:
				flash("Google search error : " + str(e)) # updated from flash("Google search error : " + e.message)

		else:
			flash('Interesting what will happen if you search for something rather than nothing !!')

	return render_template('pse/search.html', search=bmark, google=google, query=q)
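Assuming this view is registered for POST at a /search URL on a Flask app object importable from the project (both assumptions; the route decorator is not shown above), it could be exercised with Flask's test client:

# Sketch only: the endpoint path and the app import are assumptions.
# from pse import app
with app.test_client() as client:
    resp = client.post('/search', data={'q': 'flask sessions'})
    print(resp.status_code)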
Example #7
File: routes.py Project: vsraptor/pse
def search():
	google = GoogleSearch()
	bmark = BmarkSearch()

	if request.form.has_key('q') :
		q = request.form['q']

		if len(q) > 0 :

			try:
				bmark.search(q)
			except Exception as e :
				flash('Bmark search: ' + e.message)

			try :
				google.search(q)
			except Exception as e:
				flash("Google search error : " + e.message)

		else:
			flash('Interesting what will happen if you search for something rather than nothing !!')

	return render_template('pse/search.html', search=bmark, google=google, query=q)
Example #8
 def __init__(self):
     self.go = GoogleSearch()
Example #9
def Main(**kwargs):
    '''
    :param kwargs:
        start_date:
        end_date:
        keywords_file:
        output_dir:
    :return:
    '''
    start_date = kwargs[
        'start_date'] if 'start_date' in kwargs else '2017-01-01'
    end_date = kwargs['end_date'] if 'end_date' in kwargs else '2017-02-01'
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    InitLogging()

    keys_file = kwargs['keywords_file']
    key_list = GetKeyList(keys_file)
    assert len(key_list) > 0

    output_dir = kwargs['output_dir']
    mg = MagicGoogle(PROXIES)
    global logger
    searcher = GoogleSearch(logger)
    while start_date < end_date:
        """
        for key in key_list:
            q = 'www.reuters.com/article/{} {}'.format(
                start_date.strftime("%Y/%m/%d"),
                key)
            print(q)
            logger.info('info:date={}||key_word=\'{}\'\n'.format(
                start_date.strftime("%Y-%m-%d"),
                key.lower()))
            reuters_url = searcher.search(query=q)
            print(reuters_url)
            for url in reuters_url:
                if start_date.strftime('%Y%m%d') not in url:
                    continue;
                print(url)
                DownloadFromReuters(
                    os.path.join(output_dir, start_date.strftime('%Y_%m_%d')),
                    url)

            time.sleep(random.randint(60, 120))
        """
        url = 'https://www.reuters.com/resources/archive/us/{}.html'.format(
            start_date.strftime("%Y%m%d"))
        res = requests.get(url=url, proxies=PROXIES[0])
        soup = BeautifulSoup(res.text)
        ref_list = [
            h.a['href'] for h in soup.find_all("div", {'class': 'headlineMed'})
        ]
        for ref in ref_list:
            if 'videoStory' in ref:
                # filter video news
                continue
            pprint.pprint(ref)
            try:
                DownloadFromReuters(os.path.join(
                    output_dir, start_date.strftime("%Y_%m_%d")),
                                    url=ref)
            except requests.exceptions.ProxyError:
                logger.error("ProxyError||url={}".format(ref))
            # time.sleep(random.randint(1, 2))
        time.sleep(random.randint(20, 120))
        start_date += timedelta(days=1)
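The live code path in Example #9 reduces to: fetch the Reuters daily archive page and collect its headline links. A condensed sketch of just that step, without proxies or downloading (DownloadFromReuters and PROXIES are project pieces):

# Sketch only: standalone version of the archive-scraping step above.
import requests
from bs4 import BeautifulSoup

def headline_links(day):
    # day is a date or datetime; the archive URL pattern comes from the snippet above
    url = 'https://www.reuters.com/resources/archive/us/{}.html'.format(day.strftime('%Y%m%d'))
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    return [h.a['href'] for h in soup.find_all('div', {'class': 'headlineMed'})
            if h.a and 'videoStory' not in h.a['href']]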
Example #10
def validate_redirection(context):
    site = GoogleSearch(*context["browser"])
    assert 'Wikipedia' in site.title(), "Wrong redirection"
Example #11
def click_on_link(context):
    google_result = GoogleSearch(*context["browser"])
    context["browser"] = google_result.click_on()
Example #12
def visit_google(config, web_browser, context):
    google_home = GoogleSearch(web_browser, config)
    google_home.load()
    context["browser"] = google_home
    assert "Google" in google_home.title(), "You are not at Google"
Example #13
        else:
            return sentence


if __name__ == '__main__':
    query_list = [
        'deep convolutional neural network to classify the 1.2 million high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 dif- ferent',
        '各國迫切需要達成巴黎協定目標,以控制全球暖化在高於工業化前水平2°C以內。',
        'we restricted our docking study to the 76 out of 104 cases where the protein binds to a single molecule of dsDNA',
        'and hidden unit j are on together when the feature detectors are being driven by images from the training set and',
        'To achieve its impressive performance in tasks such as speech perception or object recognition, the brain extracts multiple levels of representation from the sensory input',
        'a new learning algorithm that alleviates the problem of the potential convergence to a steady-state, named Active Hebbian Learning (AHL) is presented, validated and implemented',
        'In the past several years, a number of different language modeling improvements over simple trigram models have been found,'
    ]

    searcher = GoogleSearch()
    result_match = ResultMatch()

    for index, query in enumerate(query_list):

        print(f'=================== TestCase {index} ===================')
        print(f'query string: {query}')

        # query a string in google
        results = searcher.search(query)
        #print(results)

        # get a query result summary provided by google
        # and
        # get the url of this query result webpage
        # then
Example #14
def main():
    start = GoogleSearch.millis()
    wikidataSearch = WikidataSearch('李小龍')
    wikidataSearch.process()
    print('Process time: ' + str(GoogleSearch.millis() - start) + 'ms')
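GoogleSearch.millis() is just the project's millisecond-clock helper; a standard-library swap (time.perf_counter, not the project's own approach) would look like this:

# Sketch only: same timing idea using the standard library instead of GoogleSearch.millis().
import time

start = time.perf_counter()
wikidataSearch = WikidataSearch('李小龍')
wikidataSearch.process()
print('Process time: {:.0f}ms'.format((time.perf_counter() - start) * 1000))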
Example #15
def createDataset(chosen_dataset, n, random_state, skip=0):
    # Set scope variables
    regex = re.compile(
        r'^((http(s?):\/\/)?)((www\.)?)((?!(facebook|twitter|instagram|google|youtube|tumblur|itunes|linkedin|pinterest)).)*\..*$'
    )
    if chosen_dataset == 0:
        json_path = './dataset/politifact_results.json'
        dataset = './dataset/politifact_fake.csv'
    elif chosen_dataset == 1:
        json_path = './dataset/gossipcop_results.json'
        dataset = './dataset/gossipcop_fake.csv'
    else:
        sys.exit('Give correct argument')

    # Check json file
    try:
        # Remove closing last two lines from file
        f = open(json_path, "r")
        d = f.read()
        f.close()
        m = d.split("\n")
        s = "\n".join(m[:-2]) + '\n'
        f = open(json_path, "w+")
        for i in range(len(s)):
            f.write(s[i])
    except IOError:
        # Create json file
        with open(json_path, 'w') as f:
            f.write('{\n\t"data": [\n')
    finally:
        f.close()

    with open(json_path, 'a') as f:
        # Read csv and setup Google Client
        df = pd.read_csv(dataset)
        df = df.sample(n=n, random_state=random_state)
        index_array = df.index
        google = GoogleSearch()

        # Do Loop here
        for i in range(n):
            if (i < skip):
                continue

            print('\033[92m' + 'Currently checking id #', str(i + 1), 'of',
                  str(n) + '\033[0m')

            # Pick next query
            current = df.iloc[i]

            # Check url if acceptable
            final_url = current['news_url']
            url_check = current['news_url']
            # a missing news_url is read as NaN, which compares unequal to itself
            if (current['news_url'] != current['news_url']):
                continue
            if not url_check or not regex.match(url_check):
                continue
            if 'web.archive.org/web/' in url_check:
                # Remove http(s?)web.archive.org/web/[0-9]{14}
                url_check = url_check.split('web.archive.org/web/')[1:]
                url_check = ''.join(url_check)[15:]
            if 'www.' in url_check:
                # Remove www.
                url_check = url_check[(url_check.find('www.') + 4):]
            url_check = ''.join(url_check)
            if not url_check or not regex.match(url_check):
                continue

            # Extract articles from google
            original_article = extract_articles(current['news_url'])
            if original_article is None:
                archieve_url = get_website_url_from_arhieve(
                    current['news_url'])
                if archieve_url is not None:
                    final_url = archieve_url
                    original_article = extract_articles(archieve_url)
                else:
                    continue

            # Check if content is empty
            if not original_article or not original_article['content']:
                continue

            # Run query and get results
            res = google.run(current['title'])

            # Extract content for one article
            extracted_articles = [extract_articles(i) for i in res]

            # Build json entry and save to file
            new_entry = dict({
                'id': str(index_array[i]),
                'original_article': {
                    "url": final_url,
                    "title": current['title'],
                    "content": original_article['content']
                },
                'extracted_articles': extracted_articles
            })
            json.dump(new_entry, f, sort_keys=False, indent=2)
            f.write(',\n')

        # Close json
        f.write('\n]\n}')
        f.close()
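A hedged call sketch for the createDataset routine above; the CSV and JSON paths are the hard-coded ones in the function, and extract_articles / get_website_url_from_arhieve are project helpers assumed to be importable:

# Sketch only: sample 25 rows of the politifact set (chosen_dataset=0) with a fixed
# random_state so reruns pick the same rows; skip lets a previous run be resumed.
createDataset(chosen_dataset=0, n=25, random_state=42, skip=0)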