Example #1
    def __init__(self, url, query):
        self.url = None
        self.raw_html = None
        self.crawl_date = None
        self.status = None
        self.error_type = None
        self.status_code = None
        self.type = "page"

        self.url = url
        self.query = query

        self.crawl_date = self.start_date = date.today()

        self.unwanted_extensions = [
            'css',
            'js',
            'gif',
            'asp',
            'GIF',
            'jpeg',
            'JPEG',
            'jpg',
            'JPG',
            'pdf',
            'PDF',
            'ico',
            'ICO',
            'png',
            'PNG',
            'dtd',
            'DTD',
            'mp4',
            'mp3',
            'mov',
            'zip',
            'bz2',
            'gz',
        ]
        self.adblock = Filter(file('easylist.txt'))
        self.create()
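Here Filter acts as an ad-block matcher built from the EasyList rules file; Example #4 below tests URLs against it with len(self.adblock.match(url)) > 0. A minimal usage sketch, assuming only what those examples show (a constructor taking an open rules file and a match() method whose non-empty result flags an ad URL); file() is the Python 2 builtin (open() in Python 3), and the URL is made up:

adblock = Filter(file('easylist.txt'))     # Python 2; use open() in Python 3
url = 'http://ads.example.com/banner.gif'  # hypothetical URL, for illustration only
if len(adblock.match(url)) > 0:
    print "skipping advertisement:", url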
Example #2
def handle_filters(filter_list):
    filter_dict = {}
    for idx, filter in enumerate(filter_list):
        filter_ins = Filter(f_id=filter['id'], f_name=filter['name'], f_type=filter['type'], f_update=filter['updated'])
        if filter['type'] == "SEARCH_AND_REPLACE":
            filter_ins.details = filter['searchAndReplaceDetails']
        elif filter['type'] == "INCLUDE":
            filter_ins.details = filter["includeDetails"]
        elif filter['type'] == "EXCLUDE":
            filter_ins.details = filter["excludeDetails"]
        elif filter['type'] == "LOWERCASE":
            filter_ins.details = filter["lowercaseDetails"]
        elif filter['type'] == "UPPERCASE":
            filter_ins.details = filter["uppercaseDetails"]
        elif filter['type'] == "ADVANCED":
            filter_ins.details = filter["advancedDetails"]
        else:
            filter_ins.details = {'key': 'value'}
        filter_dict["filter_{}".format(idx)] = filter_ins

    return filter_dict
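handle_filters expects a list of dicts whose keys ('id', 'name', 'type', 'updated', plus one type-specific *Details key) mirror the filter resources returned by the Google Analytics Management API. A hedged usage sketch; the field values below are made up for illustration:

# Hypothetical input shaped after the keys used above; values are illustrative only.
filters_response = [
    {
        'id': '1001',
        'name': 'Lowercase hostnames',
        'type': 'LOWERCASE',
        'updated': '2019-01-01T00:00:00Z',
        'lowercaseDetails': {'field': 'PAGE_HOSTNAME'},
    },
    {
        'id': '1002',
        'name': 'Exclude internal traffic',
        'type': 'EXCLUDE',
        'updated': '2019-02-01T00:00:00Z',
        'excludeDetails': {'field': 'GEO_IP_ADDRESS', 'expressionValue': '203.0.113.0'},
    },
]

filters = handle_filters(filters_response)
# filters == {'filter_0': <Filter id=1001 ...>, 'filter_1': <Filter id=1002 ...>}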
Example #3
	def __init__(self, url, query):
		self.url = None
		self.raw_html = None
		self.crawl_date = None
		self.status = None
		self.error_type = None
		self.status_code = None
		self.type = "page"

		self.url = url
		self.query = query
		
		self.crawl_date = self.start_date = date.today()

		self.unwanted_extensions = ['css','js','gif','asp', 'GIF','jpeg','JPEG','jpg','JPG','pdf','PDF','ico','ICO','png','PNG','dtd','DTD', 'mp4', 'mp3', 'mov', 'zip','bz2', 'gz', ]	
		self.adblock = Filter(file('easylist.txt'))
		self.create()
Example #4
class Page(object):
	def __init__(self, url, query):
		self.url = None
		self.raw_html = None
		self.crawl_date = None
		self.status = None
		self.error_type = None
		self.status_code = None
		self.type = "page"

		self.url = url
		self.query = query
		
		self.crawl_date = self.start_date = date.today()

		self.unwanted_extensions = ['css','js','gif','asp', 'GIF','jpeg','JPEG','jpg','JPG','pdf','PDF','ico','ICO','png','PNG','dtd','DTD', 'mp4', 'mp3', 'mov', 'zip','bz2', 'gz', ]	
		self.adblock = Filter(file('easylist.txt'))
		self.create()

	def create(self):	
		if self.check() and self.request() and self.control():
			return Article()
		else:
			return self.bad_status()

	def check(self):
		'''Bool: check the format of the next url compared to curr url'''
		if self.url is None or len(self.url) <= 1 or self.url == "\n":
			self.error_type = "Url is empty"
			self.status_code = 204
			self.status = False
			return False
		elif (( self.url.split('.')[-1] in self.unwanted_extensions ) and ( len( self.adblock.match(self.url) ) > 0 ) ):
			self.error_type="Url has not a proprer extension or page is an advertissement"
			self.status_code = 204
			self.status = False
			return False
		else:
			self.status = True
			return True
		
	def request(self):
		'''Bool request a webpage: return boolean and update src'''
		try:
			requests.adapters.DEFAULT_RETRIES = 2
			user_agents = [u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1', u'Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2', u'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0', u'Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00']
			headers = {'User-Agent': choice(user_agents),}
			proxies = {"https":"77.120.126.35:3128", "https":'88.165.134.24:3128', }
			try:
				self.req = requests.get((self.url), headers = headers,allow_redirects=True, proxies=None, timeout=5)
				
				try:
					
					self.raw_html = self.req.text
					self.status = True
					return True
				except Exception, e:
					
					self.error_type = "Request answer was not understood %s" %e
					self.status_code = 400
					self.status = False
					return False
				else:
					self.error_type = "Not relevant"
					self.status_code = 0
					self.status = True
					return False
Example #5
    def build_one_dataset(self, curr_data):
        # Unpack the data related info, num_examples is not used
        curr_data_path, _, extra_tensors = curr_data

        # Dictionary with keys being source, and values being directories
        self.source_paths = { 
                source: os.path.join(curr_data_path, source) \
                for source in self.sources }

        # load filters, add that to source_paths
        if self.filter_rule:
            self.filter = Filter(self.filter_rule)
            for f in self.filter.keys:
                self.source_paths[f] = os.path.join(curr_data_path, f)
                if f not in self.all_sources:
                    self.all_sources.append(f)
        else:
            self.filter = None

        # load metas
        self.meta_dict = self.parse_standard_tfmeta(self.source_paths)

        # Get tfr filenames
        source_lists = {
                source: self.get_tfr_filenames(
                    self.source_paths[source], 
                    file_pattern=self.file_pattern) \
                for source in self.source_paths}

        # This shuffle needs to be False to keep the order of every attribute
        # the same
        file_datasets = {
                source: tf.data.Dataset.list_files(curr_files, shuffle=False) \
                for source, curr_files in source_lists.items()}

        if self.is_training:
            # Shuffle file names using the same shuffle_seed
            file_datasets = {
                    source: curr_dataset.shuffle(
                        buffer_size=len(source_lists.values()[0]), 
                        seed=self.shuffle_seed).repeat() \
                    for source,curr_dataset in file_datasets.items()}

        # Create dataset for both
        def _fetch_dataset(filename):
            buffer_size = 8 * 1024 * 1024  # 8 MiB per file
            dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
            return dataset

        each_dataset = {
                source: curr_dataset.apply(
                    tf.contrib.data.parallel_interleave(
                        _fetch_dataset, 
                        cycle_length=1, 
                        sloppy=False)) \
                for source,curr_dataset in file_datasets.items()
                }

        # Decode raw first before zip
        each_dataset = {
                source: curr_dataset.map(
                    lambda x: self.postproc_each(x, source),
                    num_parallel_calls=self.map_pcall_num,
                    ) \
                for source, curr_dataset in each_dataset.items()
                }

        # Zip, repeat, batch
        zip_dataset = tf.data.Dataset.zip(each_dataset)
        zip_dataset = zip_dataset.repeat()
        zip_dataset = zip_dataset.batch(self.enqueue_batch_size)

        # Set shape (first dimension to be batchsize)
        zip_dataset = zip_dataset.map(
                lambda x: {
                    key: self.set_data_shape(value) 
                    for key,value in x.items()}, 
                num_parallel_calls=self.map_pcall_num)

        # Create sequence for each dataset
        zip_dataset = zip_dataset.map(
                lambda x: {
                    key: self.create_data_sequence(value) 
                    for key, value in x.items()}, 
                num_parallel_calls=self.map_pcall_num)

        # Add extra tensors
        def add_extra_tensors(value):
            for extra_key, extra_tensor in extra_tensors.items():
                assert extra_key not in value
                batch_size = value[value.keys()[0]].get_shape().as_list()[0]
                time = value[value.keys()[0]].get_shape().as_list()[1]
                extra_tensor = tf.constant(extra_tensor, dtype=tf.float32)
                extra_shape = extra_tensor.get_shape().as_list()
                value[extra_key] = tf.tile(
                        tf.reshape(
                            extra_tensor,
                            [1, 1] + extra_shape),
                        [batch_size, time] + [1] * len(extra_shape))
                if extra_key not in self.all_sources:
                    self.all_sources.append(extra_key)
            return value
        zip_dataset = zip_dataset.map(
                add_extra_tensors,
                num_parallel_calls=self.map_pcall_num)

        return zip_dataset
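build_one_dataset returns an endlessly repeating, batched tf.data.Dataset whose elements are dicts keyed by source name. The snippet does not show how it is consumed; the sketch below is a generic TensorFlow 1.x pattern, not the project's own training loop, and the builder/curr_data names are assumptions:

# Generic TF 1.x consumption sketch; `builder` is assumed to be an instance of the
# class above and `curr_data` the (path, num_examples, extra_tensors) tuple it unpacks.
import tensorflow as tf

zip_dataset = builder.build_one_dataset(curr_data)
next_batch = zip_dataset.make_one_shot_iterator().get_next()  # dict: source -> batched tensor

with tf.Session() as sess:
    batch = sess.run(next_batch)  # dict of numpy arrays, one per source
    # each array is shaped [enqueue_batch_size, time, ...] after the maps above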
Example #6
from flask import Flask, request, send_file
from utils import Filter, formatResponse, formatPredictionInput
import pickle
import sys
import pdb

filter = Filter('pythonsqlite.db')
app = Flask(__name__)
sat_model = pickle.load(open('primary_sat_model.sav', 'rb'))

@app.route("/scores")
@formatResponse
def send_scores():
    if all(arg in request.args for arg in ['score','conditional','subject']):
        return filter.byScore(request.args)
    elif 'subject' in request.args:
        return filter.bySubject(request.args['subject'])
    elif 'school' in request.args:
        return filter.bySchool(request.args['school'])

@app.route("/matrix")
@formatResponse
def send_matrix():
    res = send_file('matrix.svg', mimetype="image/svg+xml")
    return res

@app.route("/predict")
@formatResponse
def predict():
    input = formatPredictionInput(request.args)
    pdb.set_trace()
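The example is cut off before the app is started, so the host and port below are assumptions (Flask's defaults); the query parameters match the names checked in send_scores, but their exact value formats depend on Filter.byScore, which is not shown:

# Hypothetical client call against a locally running instance of the app above.
import requests

resp = requests.get(
    "http://127.0.0.1:5000/scores",
    params={"score": "1200", "conditional": "gte", "subject": "math"},
)
print(resp.text)  # the response format is determined by the formatResponse decorator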
Example #7
class Page(object):
    def __init__(self, url, query):
        self.url = None
        self.raw_html = None
        self.crawl_date = None
        self.status = None
        self.error_type = None
        self.status_code = None
        self.type = "page"

        self.url = url
        self.query = query

        self.crawl_date = self.start_date = date.today()

        self.unwanted_extensions = [
            'css',
            'js',
            'gif',
            'asp',
            'GIF',
            'jpeg',
            'JPEG',
            'jpg',
            'JPG',
            'pdf',
            'PDF',
            'ico',
            'ICO',
            'png',
            'PNG',
            'dtd',
            'DTD',
            'mp4',
            'mp3',
            'mov',
            'zip',
            'bz2',
            'gz',
        ]
        self.adblock = Filter(file('easylist.txt'))
        self.create()

    def create(self):
        if self.check() and self.request() and self.control():
            return Article()
        else:
            return self.bad_status()

    def check(self):
        '''Bool: check the format of the next url compared to curr url'''
        if self.url is None or len(self.url) <= 1 or self.url == "\n":
            self.error_type = "Url is empty"
            self.status_code = 204
            self.status = False
            return False
        elif ((self.url.split('.')[-1] in self.unwanted_extensions)
              and (len(self.adblock.match(self.url)) > 0)):
            self.error_type = "Url has not a proprer extension or page is an advertissement"
            self.status_code = 204
            self.status = False
            return False
        else:
            self.status = True
            return True

    def request(self):
        '''Bool request a webpage: return boolean and update src'''
        try:
            requests.adapters.DEFAULT_RETRIES = 2
            user_agents = [
                u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
                u'Mozilla/5.0 (Windows NT 6.1; rv:15.0) Gecko/20120716 Firefox/15.0a2',
                u'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
                u'Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00'
            ]
            headers = {
                'User-Agent': choice(user_agents),
            }
            proxies = {
                "https": "77.120.126.35:3128",
                "https": '88.165.134.24:3128',
            }
            try:
                self.req = requests.get((self.url),
                                        headers=headers,
                                        allow_redirects=True,
                                        proxies=None,
                                        timeout=5)

                try:

                    self.raw_html = self.req.text
                    self.status = True
                    return True
                except Exception, e:

                    self.error_type = "Request answer was not understood %s" % e
                    self.status_code = 400
                    self.status = False
                    return False
                else:
                    self.error_type = "Not relevant"
                    self.status_code = 0
                    self.status = True
                    return False
Example #8
import matplotlib.pyplot as plt
import pandas as pd
from utils import Filter
import warnings

warnings.filterwarnings("ignore")

# ## Impact of commits on performance
file_path = '../data/dataset_round1.csv'

df = pd.read_csv(file_path)
refactorings = df.columns[5:-5]

# Consider only datapoints with a single refactoring type
f = Filter(df, refactorings)
df = df[f.singlereftype()]

# Include addition dataset (second round data collection)
df_singleref = pd.read_csv('../data/dataset_round2.csv')
df = pd.concat([df_singleref, df])
refactorings = df.columns[5:-5]

## performance relative change threshold
change_threshold = 0.01 / (10**20)


def which_refactoring(row):
    num_ref = max(row[r] for r in refactorings)
    assert (num_ref > 0)
    assert (sum(row[r] for r in refactorings) == num_ref)