def main():
    fnames = []
    for p in range(1, 14):
        for g in range(1, 11):
            for r in range(1, 21):
                pgr = 'LP_data/dataset/P{}/G{}/R{}'.format(p, g, r)
                json_file = pgr + '.json'
                limg_file = pgr + '_l.png'
                rimg_file = pgr + '_r.png'
                fnames.append((json_file, limg_file, rimg_file, g))
    for i in range(50):
        hs = []
        gs = []
        cost = 0
        for json_file, limg_file, rimg_file, g in sample(fnames, 200):
            _, jd = read(json_file)
            _, limg = read(limg_file)
            _, rimg = read(rimg_file)
            start = time.time()
            h = predict(jd, limg, rimg)
            end = time.time()
            cost += end-start
            hs.append(h)
            gs.append(g-1)
        per_cost = cost / 200
        print('Per prediction cost: {:.4f}'.format(per_cost))
        print('Predictions per sec: {:.2f}'.format(1 / per_cost))
        print('accuracy: {:.2%}'.format(accuracy_score(hs, gs)))
Example #2
 def get_train_data(self):
     click_header = [
         'time', 'userID', 'ip', 'impressionID', 'adID', 'position', 'url'
     ]
     click_data = utils.read(click_path, click_header)
     impression_header = [
         'time', 'userID', 'ip', 'searchID', 'impressionID', 'adID',
         'position'
     ]
     impression_data = utils.read(impression_path, impression_header)
     click_list = set(click_data['impressionID'].values)
     impression_list = impression_data['impressionID'].values
     train_y = [0] * len(impression_list)
     i = 0
     for impressionID in impression_list:
         if impressionID in click_list:
             train_y[i] = 1
         i += 1
     clean_df = impression_data.drop(
         ['time', 'impressionID', 'ip', 'searchID'], axis=1)
     del impression_data
     clean_array = clean_df.values.tolist()
     del clean_df
     self._rows = len(clean_array)
     self._cols = self._params['userID_size'] + self._params[
         'adID_size'] + self._params['position_size']
     return clean_array, train_y
Example #3
def main() -> None:
    """Run."""
    root = Path(__file__).absolute().parents[1]
    path_readme = root / "README.md"
    path_coverage = root / "COVERAGE"
    placeholder_tag = "Code Coverage"
    regexp_pattern = rf"\[\!\[{placeholder_tag}\]\(.*\)\]\(.*\)"

    run_gocover(path_coverage)

    coverage = utils.read(path_coverage)

    coverage_pct = extract_total_coverage(coverage)

    badge_url = generate_url(coverage_pct)

    inpt = utils.read(path_readme)

    search = re.findall(regexp_pattern, inpt)

    if not search:
        raise Exception(
            f"No placeholder found in README.md. Add '[![{placeholder_tag}]()]()'."
        )

    placeholder_inject = f"[![{placeholder_tag}]({badge_url})]({badge_url})"

    out = re.sub(regexp_pattern, placeholder_inject, inpt)

    utils.write(out, path_readme)
Example #4
    def __init__(self, load=False):
        self.sizes = layer_sizes
        self.layers_num = len(layer_sizes)

        self.test_img, self.test_res = read('test')
        self.test_n = len(self.test_img)
        self.test_data = self.test_img.reshape((self.test_n, -1, 1))
        self.test_lbl = np.zeros((self.test_n, 10, 1))
        self.test_lbl[range(self.test_n), self.test_res, 0] = 1

        self.train_img, self.train_res = read('train')
        self.train_n = len(self.train_img)
        self.train_data = self.train_img.reshape((self.train_n, -1, 1))
        self.train_lbl = np.zeros((self.train_n, 10, 1))
        self.train_lbl[range(self.train_n), self.train_res, 0] = 1

        # Preprocessing
        if load:
            self.load_model()
        else:
            self.weights = [
                np.random.randn(y, x)
                for x, y in zip(layer_sizes[:-1], layer_sizes[1:])
            ]
            self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
def divide_train_dev(tweets):
    train_categories = read(TRAIN)
    dev_categories = read(DEV)
    train = []
    dev = []

    for tweet in tweets:
        if tweet.get('reply_to'):
            el = {
                'text': tweet['text'],
                'reply_to': tweet['reply_to']
            }

            if tweet['id'] in train_categories:
                el['group'] = train_categories[tweet['id']]
                train += [el]
                # train += [{
                #     'text': tweet['text'],
                #     'reply_to': tweet['reply_to'],
                #     'group': train_categories[tweet['id']]
                # }]
            else:
                el['group'] = dev_categories[tweet['id']]
                dev += [el]
                # dev += [{
                #     'text': tweet['text'],
                #     'reply_to': tweet['reply_to'],
                #     'group': dev_categories[tweet['id']]
                # }]
                # all += [el]

    write('data/train.json', train)
    write('data/dev.json', dev)
    write('data/groups.json', dict(train_categories.items() | dev_categories.items()))
Example #6
def make_dnn_feats(fpath, noise_path, snr, P, maxlen=1339):
    speech = read(fpath)
    noise = read(noise_path)
    noise = pad_noise(speech, noise)
    blend = generate_noisy(speech, noise, snr)

    mel_clean = librosa.feature.melspectrogram(y=speech,
                                               sr=16000,
                                               n_fft=512,
                                               hop_length=256,
                                               n_mels=64)
    mel_noisy = librosa.feature.melspectrogram(y=blend,
                                               sr=16000,
                                               n_fft=512,
                                               hop_length=256,
                                               n_mels=64)

    feats = pad(window(mel_noisy, P), maxlen).T
    target = np.log(mel_clean)
    mask = pad(np.ones((target.shape[0], target.shape[1])), maxlen).T
    target = pad(target, maxlen).T
    return {
        'x': torch.tensor(feats),
        't': torch.tensor(target),
        'mask': torch.tensor(mask)
    }
Example #7
def q_transform(fname, noise_path, cluster_path, snr, P, maxlen=1339):
    G_mat = np.load(cluster_path).T
    A_t = []

    speech = read(fname)
    noise = read(noise_path)
    noise = pad_noise(speech, noise)
    blend = generate_noisy(speech, noise, snr)

    mel_clean = librosa.feature.melspectrogram(y=speech,
                                               sr=16000,
                                               n_fft=512,
                                               hop_length=256,
                                               n_mels=64)
    mel_noisy = librosa.feature.melspectrogram(y=blend,
                                               sr=16000,
                                               n_fft=512,
                                               hop_length=256,
                                               n_mels=64)
    for timestep in range(mel_noisy.shape[1]):
        sums = []
        for a in range(G_mat.shape[1]):
            diff = np.sum(mel_clean[:, timestep] -
                          np.multiply(G_mat[:, a], mel_noisy[:, timestep]))
            sums.append(diff)
        sums = np.asarray(sums)
        A_t.append(np.argmin(sums))
    A_t = np.asarray(A_t).reshape(1, -1)

    feats = pad(window(mel_noisy, P), maxlen).T
    target = np.pad(A_t, ((0, 0), (0, maxlen - A_t.shape[1])),
                    mode='constant',
                    constant_values=(-1)).T
    #print("feats:", feats.shape, "target:", target.shape)
    return {"x": feats, "t": target}
Example #8
def exec_brute():
    enc = read(argv[2])
    keys = read(argv[3]).split('\n')
    alph = read(argv[4])
    f = get_file_write('bruted.txt')
    for key in keys:
        f.write('KEY : {}\n{}\n\n'.format(key, decrypt(enc, alph, key)))
        f.write('-------------------------------\n')
    f.close()
Example #9
def main(last_days):
    solar = pd.DataFrame(list(read('./data/solar.data', last_days=last_days)),
                         columns=['datetime', 'voltage (V)', 'current (mA)'])
    battery = pd.DataFrame(list(
        read('./data/battery.data', last_days=last_days)),
                           columns=['datetime', 'voltage (V)', 'current (mA)'])
    system = pd.DataFrame(list(read('./data/system.data',
                                    last_days=last_days)),
                          columns=['datetime', 'CPU', 'RAM'])
    temperature = pd.DataFrame(list(
        read('./data/temperature.data', last_days=last_days)),
                               columns=['datetime', 'CPU', 'GPU'])

    def prep(df):
        df = df[df['datetime'] > 0]
        df['datetime'] = pd.to_datetime(df['datetime'], unit='s')
        df = df.set_index('datetime')
        df = df.resample('T').median()  # per-minute median
        df = df.reset_index()
        df['datetime'] = df['datetime'].astype('int64') // 10**9
        return df

    solar = prep(solar)
    battery = prep(battery)
    system = prep(system)
    temperature = prep(temperature)

    solar['power (mW)'] = solar['voltage (V)'] * solar['current (mA)']
    battery['power (mW)'] = battery['voltage (V)'] * battery['current (mA)']

    print('Battery voltage (V)')
    tplot.scatter(battery['datetime'], battery['voltage (V)'], ylim=(3, 4.2))
    print()
    print('Power consumption (red) and solar generation (yellow) (mW)')
    tplot.scatter([battery['datetime'], solar['datetime']],
                  [battery['power (mW)'], solar['power (mW)']],
                  color=('red', 'yellow'),
                  ylim=(0,
                        max(
                            1,
                            max(battery['power (mW)'].max(),
                                solar['power (mW)'].max()))))
    print()
    print('CPU (cyan) and RAM (magenta) utilization (%)')
    tplot.scatter([system['datetime'], system['datetime']],
                  [system['CPU'], system['RAM']],
                  color=('cyan', 'magenta'),
                  ylim=(0, 100))
    print()
    print('CPU (cyan) and GPU (red) temperature')
    tplot.scatter([temperature['datetime'], temperature['datetime']],
                  [temperature['CPU'], temperature['GPU']],
                  color=('cyan', 'red'),
                  ylim=(0, 100))
Example #10
def main() -> None:
    """Run."""
    header = utils.read(ROOT / PATH_LICENSE_HEADER)
    header = adds_comment_sign(header, "#")

    files_codebase = files_at_path(str(ROOT / "libs"))

    files_missing_header = [
        ifile for ifile in files_codebase
        if header not in utils.read(ifile).strip()
    ]
    if files_missing_header:
        throw_missing_license(header, files_missing_header)
Example #11
def main():
	base_path = os.environ.get('BASE')
	local_path = os.environ.get('LOCAL')
	remote_path = os.environ.get('REMOTE')
	merged_path = os.environ.get('MERGED')
	
	print('BASE: %s' % (base_path,))
	print('LOCAL: %s' % (local_path,))
	print('REMOTE: %s' % (remote_path,))
	print('MERGED: %s' % (merged_path,))
	print()

	# Ensure paths have been provided
	for path in (base_path, local_path, remote_path, merged_path):
		if path is None:
			print('Missing path for base, local, remote or merged')
			sys.exit(1)
			return

	# Ensure source paths exist
	for path in (base_path, local_path, remote_path):
		if not os.path.exists(path):
			print('File doesn\'t exist at: %s' % (path,))
			sys.exit(1)
			return

	# Parse items
	base, _ = read(base_path)
	local, unix = read(local_path)
	remote, _ = read(remote_path)

	# Find changes between base and remote
	print('Finding change(s)...')
	
	changes = list(resolve(base, local, remote))

	# Apply changes to local
	print('Applying %d change(s)...' % (len(changes),))

	patch(changes, local, in_place=True)

	# Write result to file
	print('Writing result (unix: %r)...' % (unix,))

	write(
		merged_path, local,
		unix=unix
	)
Example #12
def process_bill(bill_id, options):
    fdsys_xml_path = _path_to_billstatus_file(bill_id)
    logging.info("[%s] Processing %s..." % (bill_id, fdsys_xml_path))

    # Read FDSys bulk data file.
    xml_as_dict = read_fdsys_bulk_bill_status_file(fdsys_xml_path, bill_id)
    bill_data = form_bill_json_dict(xml_as_dict)

    # Convert and write out data.json and data.xml.
    utils.write(
        unicode(json.dumps(bill_data, indent=2, sort_keys=True)),
        os.path.dirname(fdsys_xml_path) + '/data.json')

    from bill_info import create_govtrack_xml
    with open(os.path.dirname(fdsys_xml_path) + '/data.xml', 'wb') as xml_file:
        xml_file.write(create_govtrack_xml(bill_data, options))

    if options.get("amendments", True):
        process_amendments(bill_id, xml_as_dict, options)

    # Mark this bulk data file as processed by saving its lastmod
    # file under a new path.
    utils.write(
        utils.read(_path_to_billstatus_file(bill_id).replace(".xml", "-lastmod.txt")),
        os.path.join(os.path.dirname(fdsys_xml_path), "data-fromfdsys-lastmod.txt"))

    return {
        "ok": True,
        "saved": True,
    }
Example #13
def evl_and_save(wave_file,
                 std_txt_file,
                 evl_file,
                 framerate=16000,
                 stop_on_failure=True,
                 timeout=600):
    """ std_txt_file and evl_fp can be file_paths or io.StringIOs """
    if framerate == 8000 or framerate == "8000" or framerate == "8k" or framerate == "8K":
        tmp_wav_path = io.BytesIO()
        utils.wav_8kto16k(wave_file, tmp_wav_path)
        wave_file = tmp_wav_path
    text = utils.read(std_txt_file)
    result = _evl(wave_file, text, timeout=timeout)
    tmp_evl_dict = json.loads(result)
    logging.debug("Evaluation: %s" % tmp_evl_dict.get('desc'))
    if tmp_evl_dict['code'] == '0':
        utils.write(evl_file, result, 'w')
    else:
        logging.error(result)
        if stop_on_failure:
            raise Exception(
                "evl error while processing %s: code %s, desc - %s" %
                (wave_file, tmp_evl_dict.get('code'),
                 tmp_evl_dict.get('desc')))
        else:
            logging.error("evl error while processing %s: code %s, desc - %s" %
                          (wave_file, tmp_evl_dict.get('code'),
                           tmp_evl_dict.get('desc')))
Example #14
def create_table_json():
    data = read('data/data.json')
    all_tweets = {}
    for rumour, rumour_data in data.items():
        for x, thread in rumour_data.items():
            all_tweets[thread['source']['id_str']] = {
                'rumour': rumour,
                'text': thread['source']['text'],
                'id': thread['source']['id_str']
            }

            for key, tweet in thread['replies'].items():
                tokenized_tweet = tweet_tokenize(tweet['text'])
                all_tweets[key] = {
                    'rumour': rumour,
                    'text': tokenized_tweet,
                    'tags': tag_part_of_speech(tokenized_tweet),
                    'id': key,
                    'reply_to': tweet['in_reply_to_status_id_str']
                    # 'reply_to': all_tweets[tweet['in_reply_to_status_id_str']]['text']
                }

    for id, tweet in all_tweets.items():
        if 'reply_to' in tweet:
            tweet['reply_to'] = all_tweets[tweet['reply_to']]['text']

    write('data/tweets.json', list(all_tweets.values()))
    to_csv(list(all_tweets.values()))
    return all_tweets.values()
Example #15
def read_recipies_v2(file_name):
    data = json.loads(u.read(file_name))
    recipes = []
    for recipe in data:
        try:
            r = dict()
            r['name'] = recipe["name"]
            r["summary"] = recipe["summary"]
            r["content"] = recipe["content"]
            r["preparationTime"] = int(recipe["preparationTime"])
            r["servings"] = int(recipe["servings"])
            ing = []
            for ingredient in recipe["ingredients"]:
                i = dict()
                i["quantity"] = float(ingredient["quantity"])
                i["name"] = ingredient["name"]
                i["measurementUnit"] = ingredient["measurementUnit"]
                i["category"] = ingredient["category"]
                i["cost"] = float(ingredient["price"])
                i["weight"] = float(ingredient["weight"])
                i["priceCurrency"] = ingredient["price_currency"]
                ing.append(i)
            r["ingredients"] = ing
            recipes.append(r)
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            print("Failed to parse recipe. Error:", str(e), "Line:",
                  exc_tb.tb_lineno)
    return recipes
Example #16
def main():
	'''
	1. choose center uniformly
	2. compute distance between x and nearest center D(x)
	3. Choose 1 new data point at random as new center
	using weighted probability distribution where x is 
	chosen with prob proportion D(x)^2
	4. Repeat 2-3 until k centers have been chosen
	5. continue using k-means
	'''
	k = 3
	datafile = "toydata.txt"
	data_pnts = utils.read(datafile)
	#initialize first centroid

	centroids = get_centroids(data_pnts,k)
	# stupid way of running plots 20 times, too tired to be more elegant
	rv = k_means.run(centroids,data_pnts,k,"cost++.png")
	for j in range(19):
		centroids = get_centroids(data_pnts,k)
		temp = k_means.run(centroids,data_pnts,k,"cost++.png")
		rv = np.concatenate((rv,temp),axis = 0)

	plt.figure()

	for k in rv:
		k_means.plot_cost(k[0],k[1])
	plt.savefig("cost++.png")
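
The get_centroids helper called above is not shown in this example. Below is a minimal, hypothetical sketch of a k-means++ style initializer following the steps listed in the docstring; it assumes data_pnts is a NumPy array of shape (n, d) and is illustrative rather than the project's actual implementation.

import numpy as np

def get_centroids(data_pnts, k):
    # 1. choose the first center uniformly at random
    rng = np.random.default_rng()
    centroids = [data_pnts[rng.integers(len(data_pnts))]]
    while len(centroids) < k:
        # 2. squared distance D(x)^2 from each point to its nearest chosen center
        d2 = np.min([np.sum((data_pnts - c) ** 2, axis=1) for c in centroids], axis=0)
        # 3. pick the next center with probability proportional to D(x)^2
        centroids.append(data_pnts[rng.choice(len(data_pnts), p=d2 / d2.sum())])
    # 4.-5. once k centers are chosen, ordinary k-means (k_means.run) takes over
    return np.array(centroids)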
Example #17
def read_fdsys_bulk_bill_status_file(fn, bill_id):
    fdsys_billstatus = utils.read(fn)
    return xmltodict.parse(fdsys_billstatus,
                           force_list=(
                               'item',
                               'amendment',
                           ))
Example #18
	def var_score(self, file):
		df = read(file)
		scores = []

		for var in ['temperature']:
			print(variable_map[var])
			print(self.dict[var]['bins'])
			print(self.dict[var]['labels'])
			# TO DO : fix non-unique mapping issue
			score = pd.cut(df[variable_map[var]],
						   bins=self.dict[var]['bins'],
						   labels=self.dict[var]['labels'])
			score = score.max()
			print(score)

		# for var in self.dict:
		# 	try:
		# 		score = pd.cut(df[variable_map[var]],
		# 					   bins=self.dict[var]['bins'],
		# 					   labels=self.dict[var]['labels'])
		# 	except:
		# 		print('Variable: "%s" not in dataframe' % var)
		# 		continue
			scores += [score]

		return scores
Example #19
def read_file(path, counter, update_kb=False):
    sentences = parse(read(path), ["\n\n", "\n", "\t"])
    for sentence in sentences:
        sentence = Sentence(sentence)
        counter.update(sentence, update_kb=update_kb)
    print("read %d sentences" % len(sentences))
    return len(sentences)
Example #21
    def __getitem__(self, index):
        fname = self.files[index]

        # (# bodies, # frames, # keypoints, xy)
        f = None

        f = utils.read(os.path.join(self.root_dir, fname))
        # f = self._valid_data(f)

        # Pin to one of the keypoints
        f = self._pin_skeleton(f)

        # Align the frames
        f = self._align_frames(f)
        # assert f.shape[1] == self.num_frames, "wrong frames %d" % f.shape[1]

        # At most 1
        if self.pin_body is not None and self.scale_each_body:
            for i in range(4):
                f[i // 2, ...,
                  i % 2] /= np.abs(f[i // 2, ..., i % 2]).max() + 1e-5

        if self.merge == 1:
            f = f.reshape((*f.shape[:2], 50))
        elif self.merge == 2:
            f = f.reshape((f.shape[0] * self.num_frames, 50))

        return f
Example #22
def main():
    movie_list = utils.read()

    features = {
        '0': crawler.start_scraper,
        '1': analyzer.get_improved_pops,
        '2': analyzer.get_prev_positions,
        '3': analyzer.get_top_rated_genre
    }

    while True:
        msg = '''
0: Enter 0 to fetch latest movies from popular movies list,
1: Enter 1 to view movies that have improved in popularity,
2: Enter 2 to view previous week's positions,
3: Enter 3 to see top rated genre,
n: Enter n/N to exit: '''
        choice = input(msg)
        validate_choice(choice)

        func = features.get(choice)
        if func:
            result = func(movie_list=movie_list)
        else:
            print(f"Invalid operation: {choice}")
            result = None

        choice = input('Continue? (y/n): ')
        validate_choice(choice)
Example #23
def main():
    path_len = 100
    nb_feature = 16
    weibo_file = os.path.join(project_folder, 'dataset', 'weibo', 'weibo.txt')
    lines = utils.read_lines(weibo_file)
    x = []
    y = []
    i = 1
    for line in lines:
        print(i)
        i += 1
        line = line.replace('\t', ' ')
        sp = line.split(' ')
        eid = sp[0].split(':')[1]
        label = sp[1].split(':')[1]
        y.append(int(label))
        f = []
        json_file = os.path.join(project_folder, 'dataset', 'weibo', 'Weibo',
                                 eid + '.json')
        text_content = utils.read(json_file)
        json_content = json.loads(text_content)
        for post in json_content[0:path_len]:
            f.append(get_feature(post))
        if len(f) < path_len:
            for j in range(path_len - len(f)):
                f.append([0 for j in range(nb_feature)])
        x.append(f)

    y = numpy.array(y)
    x = numpy.array(x)

    print(x.shape, y.shape)
    numpy.save(os.path.join(project_folder, 'feature', 'weibo', 'x.npy'), x)
    numpy.save(os.path.join(project_folder, 'feature', 'weibo', 'y.npy'), y)
Example #24
File: main.py Project: yp05327/esoraidbot
async def on_ready():
    global raidinfo
    print('Logged in as')
    print(client.user.name)
    print(client.user.id)
    print('------')
    raidinfo = utils.read(config.info['record_file_name'])
 def _create_rms_job_script(self):
     """
     Renders the job RMS template with RMS context and creates the job script file on case working directory.
     """
     job_path = os.path.join(self.work_dir, RMS_JOB_FILE_NAME)
     job_content = Template(read(RMS_JOB_FILE_PATH)).render(self._get_rms_context())
     write(job_path, job_content)
Example #26
def main():
    '''Instantiates program wide info'''
    k = 3
    datafile = "toydata.txt"
    data_pnts = utils.read(datafile)

    rv = run(data_pnts,k)
Example #27
def post(path, apikey, url, **kwargs):
    output = kwargs.get('output', 'xml')
    cat = kwargs.get('cat', 'Default')
    priority = kwargs.get('priority', '-100')
    pp = kwargs.get('pp', '-1')
    nzbname = kwargs.get('nzbname', '')
    form = MultiPartForm() 
    nzb_data = utils.read(path, 'rb')
    form.add_field('apikey', apikey)
    form.add_field('mode', 'addfile')
    form.add_field('output', output)
    form.add_field('cat', cat)
    form.add_field('priority', priority)
    form.add_field('pp', pp)
    form.add_field('nzbname', nzbname)
    form.add_file('nzbfile', path, fileHandle=StringIO(nzb_data))
    req = urllib2.Request(url)
    body = str(form)
    req.add_header('Content-type', form.get_content_type())
    req.add_header('Content-length', len(body))
    req.add_data(body)
    response = urllib2.urlopen(req)
    # Dont care about the response
    response.close()
    return "ok"
Example #28
def main():
    movie_list = utils.read()

    features = {
        '1': crawler.start_scraper,
        '2': analyzer.get_improved_pops,
        '3': analyzer.get_prev_positions,
        '4': analyzer.get_top_rated_genre
    }

    def run():
        while True:
            choice = input(f"{menu}: ")
            utils.validate_choice(choice)

            func = features.get(choice, None)
            if func:
                result = func(movie_list=movie_list, verbose=__verbosity)
            else:
                print(f"Invalid operation: {choice}")
                result = None

            choice = input('Continue? (y/n): ')
            utils.validate_choice(choice)

    args = sys.argv[1:]
    if args:
        if args[0] == '-h':
            print_help()
        else:
            print("Invalid option, see the help below to run the program:")
            print_help()
    else:
        run()
Example #29
File: player.py Project: hekevintran/Rpg
	def createNewPlayerFromStdIn(self):
		self._readLoginAndPassword(True, True)

		genders = gender.model.loadAll()
		nbGenders = len(genders)

		for k, v in enumerate(genders):
			print(v['name'] + " (" + str(k) + ")")

		g = -1
		while g < 0 or g >= nbGenders:
			g = utils.read("Character gender: ")
			try:
				g = int(g)
			except:
				g = -1

		genderId = genders[g]['id_gender']

		sps = species.model.getSpecies(genders[g]['name'])
		nbSpecies = len(sps)

		for k, v in enumerate(sps):
			print(v['name'] + " (" + str(k) + ")")
			print(v['description'])

		sp = -1
		while sp < 0 or sp >= nbSpecies:
			sp = utils.read("Character species: ")
			try:
				sp = int(sp)
			except:
				sp = -1

		speciesId = sps[sp]['id_species']

		self._model = {
			'login': self._login,
			'name': self._login,
			'password': self._password,
			'id_species': speciesId,
			'id_gender': genderId,
			'id_area': 1
		}

		self._model['id_character'] = character.model.insert(self._model)
		model.insert(self._model)
Example #30
 def _read_datasets(dataset, target):
     for person, gesture, files in dataset:
         print_log('Processing {:<3s} - {:<3s}'.format(person, gesture))
         for f in files:
             idx = int(f[:f.find('_')])
             path = os.path.join(base_dir, person, gesture, f)
             key, val = read(path)
             target[person][gesture][idx][key] = val
Example #31
 def run(self):
     if self.use_pro_api:
         tmp_wav_path = io.BytesIO()
         utils.wav_8kto16k(self.wav_file,tmp_wav_path)
         file_content = utils.read(tmp_wav_path, 'rb')
     else:
         file_content = utils.read(self.wav_file, 'rb')
     # print(len(file_content))
     max_retry = config.RCG_MAX_RETRY
     for retry in range(max_retry + 1):
         try:
             # Specify pcm rather than wav so Baidu does not convert again (can avoid err 3301: audio quality problems in some cases)
             # Tested with 1537 at 8 kHz, 30 qps
             if self.use_pro_api:
                 rst = self.aip_speech.asr_pro(file_content, 'pcm', 16000,
                                         {'dev_pid': 80001})
             else:
                 rst = self.aip_speech.asr(file_content, 'pcm', 8000,
                                           {'dev_pid': 1537})
             """
             dev_pid  Language                               Model               Punctuation  Notes
             1536     Mandarin (basic English supported)     search model        none         custom vocabulary supported
             1537     Mandarin (Chinese only)                input-method model  yes          no custom vocabulary
             1737     English                                                    none         no custom vocabulary
             1637     Cantonese                                                  yes          no custom vocabulary
             1837     Sichuanese                                                 yes          no custom vocabulary
             1936     Mandarin, far-field                    far-field model     yes          not supported
             """
             if rst['err_no'] == 0:
                 self.result = rst['result'][0]  # rcg text
                 logging.debug("Recognition: %s" % self.result)
                 break
             elif rst['err_no'] == 3304 or rst['err_no'] == '3304':  # QPS limit exceeded, wait one second
                 logging.warning('QPS limit exceeded (waiting 1 second): %s' % rst.get('err_msg'))
                 time.sleep(1)
             elif rst['err_no'] == 3301:  # poor audio quality: return an empty result, do not retry
                 self.result = ''
                 logging.warning('Poor audio quality: %s' % rst.get('err_msg'))
                 break
             else:
                 logging.error('Recognition error: %s' % rst.get('err_msg'))
                 logging.error(rst)
                 raise Exception('Recognition failed!')
         except Exception as e:
             logging.warning('RcgCore: on retry %d:' % retry)
             logging.warning(e)
 def get_runtime(self):
     """
     Returns case runtime.
     """
     runtime = "-"
     runtime_file = os.path.join(self.work_dir, RUNTIME_FILE)
     if os.path.exists(runtime_file): runtime = read(runtime_file)
     return self.safely_convert(runtime.rstrip("\n"), float)
Example #33
 def decrypt(self):
     self.decrypted_text = None
     text = read(self.ui.line_openfile_decrypt.text())
     keyword = self.ui.line_key.text()
     self.tricemus.change_key(keyword=keyword)
     decrypt_text = self.tricemus.decrypt(text)
     self.decrypted_text = decrypt_text
     write(self.ui.line_savefile_decrypt.text(), decrypt_text)
Example #34
File: parsers.py Project: onchere/whack
def getGrammarList():
    lines = read("../build/whack.grammar")
    grammars = []
    for line in lines.split('\n'):
        match = re.match('[a-zA-Z]+', line)
        if match != None:
            grammars.append(match.group())
    return grammars
 def _create_input_files(self):
     """
     Renders the input files given in the config with application context and creates them on case working directory.
     """
     for input_ in self.config["inputs"]:
         template = read(input_["template"])
         write(os.path.join(self.work_dir, input_["name"]),
               Template(template).render(self._get_application_context()))
Example #36
def reparse_actions(bill_id, options):
    # Load an existing bill status JSON file.
    data_json_fn = output_for_bill(bill_id, 'json')
    source = utils.read(data_json_fn)
    bill_data = json.loads(source)

    # Munge data.
    from bill_info import parse_bill_action
    title = bill_info.current_title_for(bill_data['titles'], 'official')
    old_status = None
    for action in bill_data['actions']:
      new_action, new_status = parse_bill_action(action, old_status, bill_id, title)
      if new_status:
        old_status = new_status
        action['status'] = new_status
      # clear out deleted keys
      for key in ('vote_type', 'how', 'where', 'result', 'roll', 'suspension', 'calendar', 'under', 'number', 'committee', 'pocket', 'law', 'congress'):
        if key in action and key not in new_action:
          del action[key]
      action.update(new_action)

    status, status_date = bill_info.latest_status(bill_data['actions'], bill_data['introduced_at'])
    bill_data['status'] = status
    bill_data['status_at'] = status_date

    # Show user a diff on the console to accept changes.
    def show_diff_ask_ok(source, revised, fn):
      if source == revised: return False # nothing to do
      def split_lines(s): return [l+"\n" for l in s.split("\n")]
      import sys
      from difflib import unified_diff
      sys.stdout.writelines(unified_diff(split_lines(source), split_lines(revised), fromfile=fn, tofile=fn))
      return raw_input("Apply change? (y/n) ").strip() == "y"

    wrote_any = False

    # Write new data.json file.
    revised = json.dumps(bill_data, indent=2, sort_keys=True)
    if show_diff_ask_ok(source, revised, data_json_fn):
      utils.write(revised, data_json_fn)
      wrote_any = True

    # Write new data.xml file.
    from bill_info import create_govtrack_xml
    data_xml_fn = data_json_fn.replace(".json", ".xml")
    with open(data_xml_fn, 'r') as xml_file:
        source = xml_file.read()
    revised = create_govtrack_xml(bill_data, options)
    if show_diff_ask_ok(source, revised.decode("utf8"), data_xml_fn):
      with open(data_xml_fn, 'wb') as xml_file:
        xml_file.write(revised)
      wrote_any = True

    return {
        "ok": True,
        "saved": wrote_any,
        "reason": "no changes or changes skipped by user",
    }
Example #37
File: Rpg.py Project: hekevintran/Rpg
	def _doInteractiveAuth(self):
		choice = 0
		while choice != '1' and choice != '2':
			choice = utils.read("new account (1) or login (2) ? ")

		if choice == '1':
			self._player.createNewPlayerFromStdIn()
		elif choice == '2':
			self._player.loadPlayerFromStdIn()
Example #38
File: fdsys.py Project: GPHemsley/congress
def get_sitemap(year, collection, lastmod, options):
  """Gets a single sitemap, downloading it if the sitemap has changed.
  
  Downloads the root sitemap (year==None, collection==None), or
  the sitemap for a year (collection==None), or the sitemap for
  a particular year and collection. Pass lastmod which is the current
  modification time of the file according to its parent sitemap, which
  is how it knows to return a cached copy.
  
  Returns the sitemap parsed into a DOM.
  """
  
  # Construct the URL and the path to where to cache the file on disk.
  if year == None:
    url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
    path = "fdsys/sitemap/sitemap.xml"
  elif collection == None:
    url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
    path = "fdsys/sitemap/%s/sitemap.xml" % year
  else:
    url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
    path = "fdsys/sitemap/%s/%s.xml" % (year, collection)
    
  # Should we re-download the file?
  lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
  if options.get("cached", False):
    # If --cached is used, don't hit the network.
    force = False
  elif not lastmod:
    # No *current* lastmod date is known for this file (because it is the master
    # sitemap file, probably), so always download.
    force = True
  else:
    # If the file is out of date or --force is used, download the file.
    cache_lastmod = utils.read(lastmod_cache_file)
    force = (lastmod != cache_lastmod) or options.get("force", False)
    
  if force:
    logging.warn("Downloading: %s" % url)
    
  body = utils.download(url, path, utils.merge(options, {
    'force': force, 
    'binary': True
  }))
  
  if not body:
      raise Exception("Failed to download %s" % url)
      
  # Write the current last modified date to disk so we know the next time whether
  # we need to fetch the file.
  if lastmod and not options.get("cached", False):
    utils.write(lastmod, lastmod_cache_file)
  
  try:
    return etree.fromstring(body)
  except etree.XMLSyntaxError as e:
    raise Exception("XML syntax error in %s: %s" % (url, str(e)))
Example #39
File: Rpg.py Project: hekevintran/Rpg
	def readCommand(self):
		"""
		Method to set the autocompleter and run the prompt, from utils
		"""

		completer = command.completer()
		readline.set_completer(completer.complete)
		readline.parse_and_bind('tab: complete')
		readline.set_completer_delims('')
		return utils.read("Command: ")
Example #40
File: fdsys.py Project: JT5D/congress
def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options):
  # Where should we store the file?
  path = get_output_path(year, collection, package_name, granule_name, options)
  if not path: return # should skip
  
  # Do we need to update this record?
  lastmod_cache_file = path + "/lastmod.txt"
  cache_lastmod = utils.read(lastmod_cache_file)
  force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)
  
  # Try downloading files for each file type.
  targets = get_package_files(package_name, granule_name, path)
  updated_file_types = set()
  for file_type in file_types:
    if file_type not in targets: raise Exception("Invalid file type: %s" % file_type)
    f_url, f_path = targets[file_type]
    
    if (not force) and os.path.exists(f_path): continue # we already have the current file
    logging.warn("Downloading: " + f_path)
    data = utils.download(f_url, f_path, utils.merge(options, {
      'binary': True, 
      'force': force, 
      'to_cache': False,
      'needs_content': file_type == "text" and f_path.endswith(".html"),
    }))
    updated_file_types.add(file_type)
    
    if not data:
      if file_type == "pdf":
        # expected to be present for all packages
        raise Exception("Failed to download %s" % package_name)
      else:
        # not all packages have all file types, but assume this is OK
        logging.error("file not found: " + f_url)
        continue
    
    if file_type == "text" and f_path.endswith(".html"):
      # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
      # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
      #       html.fromstring does auto-detection.
      with open(f_path[0:-4] + "txt", "w") as f:
        text_content = unicode(html.fromstring(data).text_content())
        f.write(text_content.encode("utf8"))
        
  if collection == "BILLS" and "mods" in updated_file_types:
    # When we download bill files, also create the text-versions/data.json file
    # which extracts commonly used components of the MODS XML.
    from bill_versions import write_bill_version_metadata
    write_bill_version_metadata(get_bill_id_for_package(package_name, with_version=True))

  # Write the current last modified date to disk so we know the next time whether
  # we need to fetch the files for this sitemap item.
  if lastmod and not options.get("cached", False):
    utils.write(lastmod, lastmod_cache_file) 
Example #41
File: cache.py Project: ownport/webpage
    def retrieve(self, request):

        filename = self._fn(request.url)
        resp = Response()

        headers = utils.read('%s.metadata' % filename)
        if headers:
            try:
                headers = CaseInsensitiveDict(json.loads(headers))
            except ValueError:
                return None
            headers['x-cache'] = 'HIT from %s' % self.__class__.__name__
            resp.url = headers.pop('url', None)
            resp.status_code = headers.pop('status-code', None)
            resp.encoding = headers.pop('encoding', None)
            resp.headers = headers
            resp._content = utils.read(filename)
            return resp
        else:
            return None
Example #42
File: report.py Project: ArthurWu/e-leave
def is_need_mail_alert(leave_request):
	alert_cycle = utils.read(utils.resfile(), 'Default', 'email.alert.cycle')
	exec 'cycle = [' + alert_cycle + ']'
	
	if leave_request.delay_days() > sum(cycle):
		return True
	
	days = [sum(cycle[:cycle.index(i)+1]) for i in cycle]
	if leave_request.delay_days() in days:
		return True
	
	return False
Example #43
def profileWrite(chunks):
    writetimes = []
    for chunk in chunks:
        data = utils.read("chunks/" + chunk) 
        t1 = time.time()
        utils.write("output/" + chunk, data)
        t2 = time.time()
        writetimes.append(t2-t1)
    totalWritetime = 0
    for writetime in writetimes:
        totalWritetime = totalWritetime + writetime
    avgWritetime = totalWritetime / len(chunks)
    print "Average write time:", avgWritetime
    print "Total write time:", totalWritetime
    print "Number of chunks written:", len(chunks)
Example #44
File: monotool.py Project: jness/monotool
    def __generate_project_template(self, project_name, template):
        """
        Generates template for project.
        """
        pwd = os.path.dirname(__file__)
        filename = '%s/templates/%s.stache' % (pwd, template)

        data = dict(
            project_name=project_name,
            version=get_version(),
            timestamp=timestamp()
        )
        template = read(filename)
        rendered = pystache.render(template, data)
        return rendered
Example #45
File: govinfo.py Project: d0tN3t/congress
def mirror_bulkdata_file(collection, url, item_path, lastmod, options):
    # Return a list of files we downloaded.
    results = []

    # Where should we store the file?
    path = "%s/govinfo/%s/%s" % (utils.data_dir(), collection, item_path)

    # For BILLSTATUS, store this along with where we store the rest of bill
    # status data.
    if collection == "BILLSTATUS":
        from bills import output_for_bill
        bill_id, version_code = get_bill_id_for_package(os.path.splitext(os.path.basename(item_path.replace("BILLSTATUS-", "")))[0], with_version=False)
        path = output_for_bill(bill_id, FDSYS_BILLSTATUS_FILENAME, is_data_dot=False)

    # Where should we store the lastmod found in the sitemap so that
    # we can tell later if the file has changed?
    lastmod_cache_file = os.path.splitext(path)[0] + "-lastmod.txt"

    # Do we already have this file up to date?
    if os.path.exists(lastmod_cache_file) and not options.get("force", False):
        if lastmod == utils.read(lastmod_cache_file):
            return

    # With --cached, skip if the file is already downloaded.
    if os.path.exists(path) and options.get("cached", False):
        return

    # Download.
    logging.warn("Downloading: " + path)
    data = utils.download(url, path, utils.merge(options, {
        'binary': True,
        'force': True, # decision to cache was made above
        'to_cache': False,
    }))
    results.append(path)

    if not data:
        # Something failed.
        return

    # Write the current last modified date back to disk so we know the next time whether
    # we need to fetch the file again.
    utils.write(lastmod, lastmod_cache_file)

    return results
Example #46
def run(options):
  amendment_id = options.get('amendment_id', None)
  bill_id = options.get('bill_id', None)
  
  search_state = { }

  if amendment_id:
    amendment_type, number, congress = utils.split_bill_id(amendment_id)
    to_fetch = [amendment_id]

  elif bill_id:
    # first, crawl the bill
    bill_type, number, congress = utils.split_bill_id(bill_id)
    bill_status = fetch_bill(bill_id, options)
    if bill_status['ok']:
      bill = json.loads(utils.read(output_for_bill(bill_id, "json")))
      to_fetch = [x["amendment_id"] for x in bill["amendments"]]
    else:
      logging.error("Couldn't download information for that bill.")
      return None

  else:
    congress = options.get('congress', utils.current_congress())

    to_fetch = bill_ids_for(congress, utils.merge(options, {'amendments': True}), bill_states=search_state)
    if not to_fetch:
      if options.get("fast", False):
        logging.warn("No amendments changed.")
      else:
        logging.error("Error figuring out which amendments to download, aborting.")

      return None

    limit = options.get('limit', None)
    if limit:
      to_fetch = to_fetch[:int(limit)]

  if options.get('pages_only', False):
    return None

  logging.warn("Going to fetch %i amendments from congress #%s" % (len(to_fetch), congress))
  saved_amendments = utils.process_set(to_fetch, fetch_amendment, options)

  # keep record of the last state of all these amendments, for later fast-searching
  save_bill_search_state(saved_amendments, search_state)
Example #47
def linkRatioStats(filepath):
  yearlyRatios = read(filepath)
  means = []
  alphas=[]
  for year in yearlyRatios:
    print year
    if int(year)>=1950:
      filteredData = filter(lambda x: x[3]>100 and x[4]>100, yearlyRatios[year])
      # the histogram of trade
      plt.clf()
      ratioData=map(lambda x: x[2], filteredData)
      bins=np.arange(0, 3, 0.01)
      hist=np.histogram(ratioData, density=True, bins=bins)[0]
      #n, bins, patches = plt.hist(ratioData, normed=1, facecolor='green', alpha=0.75, bins=np.arange(0, 3, 0.01))
      plt.scatter(map(lambda x: log(x),bins[1:]),map(lambda x: log(x),hist),c='r', marker='o')
      plt.xlabel('Export Ratio exp(t)/exp(t+1)')
      plt.ylabel('Probability')
      plt.title("Export Ratio Distribution"+str(year))
      plt.savefig(get_images_directory(resource)+'RatioDistribution'+str(year)+'.png')

      plt.clf()
      ratioData=map(lambda x: x[4], filteredData)
      xmin=min(ratioData)
      ahat=1+len(ratioData)*(1/sum([math.log(s/xmin) for s in ratioData]))
      print "MLE ", ahat
      alphas.append(ahat)
      #print plfit(ratioData)
      bins=range(0, 1000, 100)
      hist=np.histogram(ratioData, density=True, bins=bins)[0]
      #n, bins, patches = plt.hist(map(lambda x: x[4], filteredData), normed=1, facecolor='green', alpha=0.75, bins=range(0, 1000000000, 10000000))
      plt.loglog(bins[1:],hist,'r', marker='o')
      plt.xlabel('Export')
      plt.ylabel('Probability')
      plt.title("Dollar Distribution"+str(year)+"  a="+str(ahat))
      plt.savefig(get_images_directory(resource)+'WeightDistribution'+str(year)+'.png')
  plt.clf()
  plt.plot(yearlyRatios.keys(), alphas)
  plt.xlabel('years')
  plt.ylabel('alphas')
  plt.title("Alphas")
  plt.savefig(get_images_directory(resource)+'WeightDistributionAlphas.png')
    
  return 0
Example #48
File: player.py Project: hekevintran/Rpg
	def _readLoginAndPassword(self, checkLogin, confirmPassword):
		while self._login is None or self._login == '':
			self._login = utils.read("Login: "******"Password: "******"Confirm password: ")
			else:
				confirmPassword = self._password

			if self._password != confirmPassword:
				print('The passwords do not match')
				self._password = None
Example #49
File: fdsys.py Project: hugovk/congress
def should_download_sitemap(lastmod_cache_file, current_lastmod, options):
    # Download a sitemap or just read from our cache?

    if not current_lastmod:
        # No lastmod is known for this file (it's the root of a sitemap
        # tree - this is the first web request).
        return True

    elif options.get("force", False):
        # User requests downloading everything.
        return True

    elif options.get("cached", False):
        # User requests downloading nothing.
        return False

    else:
        # Download if the lastmod from the parent sitemap doesn't agree with
        # the lastmod stored on disk.
        return current_lastmod != utils.read(lastmod_cache_file)
Example #50
File: nfo.py Project: jaqb/xbmc-pneumatic
 def __init__(self, nfo_path):
     self.nfo_path = nfo_path
     filename_movie = utils.join(self.nfo_path, ("movie.nfo"))
     filename_tvshow = utils.join(self.nfo_path, ("episode.nfo"))
     self.is_episode = False
     if utils.exists(filename_movie):
         filename = filename_movie
     elif utils.exists(filename_tvshow):
         filename = filename_tvshow
         self.is_episode = True
     try:
         out = parseString(utils.read(filename, "r"))
     except:
         log(("ReadNfoLabels: could not open: %s.nfo") % (xbmc.translatePath(self.nfo_path)))
         out = None
     if out:
         self.info_labels = self._get_info_labels(out)
     else:
         self.info_labels = {"title": os.path.basename(self.nfo_path)}
     self.thumbnail = utils.join(self.nfo_path, "folder.jpg")
     self.fanart = utils.join(self.nfo_path, "fanart.jpg")
Example #51
def get_sitemap(year, collection, lastmod, options):
    # Construct the URL and the path to where to cache the file on disk.
    if year == None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
        path = "fdsys/sitemap/sitemap.xml"
    elif collection == None:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
        path = "fdsys/sitemap/%s/sitemap.xml" % year
    else:
        url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
        path = "fdsys/sitemap/%s/%s.xml" % (year, collection)

    # Should we re-download the file?
    lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
    if options.get("cached", False):
        # If --cached is used, don't hit the network.
        force = False
    elif not lastmod:
        # No *current* lastmod date is known for this file (because it is the master
        # sitemap file, probably), so always download.
        force = True
    else:
        # If the file is out of date or --force is used, download the file.
        cache_lastmod = utils.read(lastmod_cache_file)
        force = (lastmod != cache_lastmod) or options.get("force", False)

    if force:
        logging.warn("Downloading: %s" % url)

    body = utils.download(url, path, utils.merge(options, {"force": force, "xml": True}))

    if not body:
        raise Exception("Failed to download %s" % url)

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the file.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)

    return etree.fromstring(body)
Example #52
def profileRead(chunks):
    chunksizes = []
    readtimes = []
    for chunk in chunks:
        t1 = time.time()
        data = utils.read("chunks/" + chunk)
        t2 = time.time()
        readtimes.append(t2-t1)
        chunksizes.append(len(data))
    totalReadtime = 0
    for readtime in readtimes:
        totalReadtime = totalReadtime + readtime
    avgReadtime = totalReadtime / len(chunks)
    totalChunksize = 0
    for chunksize in chunksizes:
        totalChunksize = totalChunksize + chunksize
    avgChunksize = totalChunksize / len(chunksizes)
    print "Average read time:", avgReadtime
    print "Total read time:", totalReadtime
    print "Average chunk size:", avgChunksize
    print "Total chunk size:", totalChunksize
    print "Number of chunks read:", len(chunks)
Example #53
def extract(root_dir, directory, filename):
    path = root_dir + directory + "/" + filename
    data = {
        "url": "http://www.armslist.com/posts/" + directory + "/" + filename,
        "gid": directory
    }
    
    page = read(path).decode("ascii", "replace")
    pairs = re.findall("<dt>([\w\s]+):</dt>\s+<dd(?:\sclass=\"[\w\s-]+\")?>(.*?)<", page, flags=re.DOTALL)
    for pair in pairs:
        data[pair[0].strip().lower().replace(" ", "_")] = pair[1].strip()

    doc = fromstring(page)
    try:
        data["title"] = doc.xpath("//h1[@class='title']/text()")[0]
    except:
        print "ERROR", directory, filename
        return
    
    data["text"] = doc.xpath("//section[@class='content']")[0].text_content().strip()
    data["images"] = doc.xpath("//section[@class='images']/figure/img/@src")
    write(json.dumps(data, indent=2), "postings/" + data["gid"] + ".json")
    return data
Example #54
def main():

    datafile = "toydata.txt"
    k = 3

    # Read in file
    data_pnts = utils.read(datafile)
    centroids = utils.init_centroids(data_pnts,k)

    rv = run(centroids,data_pnts,k,"cost.png","k_means.png")
    # run through k_means ~ 19 times to account for random init
    
    for i in range(19):
        centroids = utils.init_centroids(data_pnts,k)

        temp = run(centroids,data_pnts,k,"cost.png")
        rv = np.concatenate((rv,temp),axis = 0)
    plt.figure()


    for j in rv:
        plot_cost(j[0],j[1])
    plt.savefig("cost.png")
Example #55
def prepare(imgPath, detector, w, h):
    '''
    Reads an image, resizes it, and returns the object(s) prepared for comparison.

    :param imgPath: path to the image
    :param detector: feature detector (its detectAndCompute method is used)
    :param w: image width after resize
    :param h: image height after resize

    :returns kp: keypoints of the image
    :returns desc: descriptors of the image's keypoints
    '''
    img = utils.read(imgPath)
    print img
    if img is None:
        raise Exception(u"Can't open file '%s'" % imgPath)
    img = utils.resize(img, w, h)
    
    print u'PREPARE %s' % imgPath
    
    if not detector:
        raise Exception("Detector can't be None")
    kp, desc = detector.detectAndCompute(img, None)
    return kp, desc
Example #56
File: bills.py Project: TTREN/congress
def bill_ids_for(congress, options, bill_states={}):

    # override if we're actually using this method to get amendments
    doing_amendments = options.get('amendments', False)

    bill_ids = []

    bill_type = options.get('amendment_type' if doing_amendments else 'bill_type', None)
    if bill_type:
        bill_types = [bill_type]
    else:
        bill_types = utils.thomas_types.keys()

    for bill_type in bill_types:

        # This sub is re-used for pulling amendment IDs too.
        if (bill_type in ('samdt', 'hamdt', 'supamdt')) != doing_amendments:
            continue

        # match only links to landing pages of this bill type
        # it shouldn't catch stray links outside of the confines of the 100 on the page,
        # but if it does, no big deal
        link_pattern = "^\s*%s\d+\s*$" % utils.thomas_types[bill_type][1]

        # loop through pages and collect the links on each page until
        # we hit a page with < 100 results, or no results
        offset = 0
        while True:
            # download page, find the matching links
            page = utils.download(
                page_for(congress, bill_type, offset),
                page_cache_for(congress, bill_type, offset),
                options)

            if not page:
                logging.error("Couldn't download page with offset %i, aborting" % offset)
                return None

            # extract matching links
            doc = html.document_fromstring(page)
            links = doc.xpath(
                "//a[re:match(text(), '%s')]" % link_pattern,
                namespaces={"re": "http://exslt.org/regular-expressions"})

            # extract the bill ID from each link
            for link in links:
                code = link.text.lower().replace(".", "").replace(" ", "")
                bill_id = "%s-%s" % (code, congress)

                if options.get("fast", False):
                    fast_cache_path = utils.cache_dir() + "/" + bill_info.bill_cache_for(bill_id, "search_result.html")
                    old_state = utils.read(fast_cache_path)

                    # Compare all of the output in the search result's <p> tag, which
                    # has last major action, number of cosponsors, etc. to a cache on
                    # disk to see if any major information about the bill changed.
                    parent_node = link.getparent()  # the <p> tag containing the whole search hit
                    parent_node.remove(parent_node.xpath("b")[0])  # remove the <b>###.</b> node that isn't relevant for comparison
                    new_state = etree.tostring(parent_node)  # serialize this tag

                    if old_state == new_state:
                        logging.info("No change in search result listing: %s" % bill_id)
                        continue

                    bill_states[bill_id] = new_state

                bill_ids.append(bill_id)

            if len(links) < 100:
                break

            offset += 100

            # sanity check, while True loops are dangerous
            if offset > 100000:
                break

    return utils.uniq(bill_ids)
Example #57
def get_bills_to_process(options):
    # Return a generator over bill_ids that need to be processed.
    # Every time we process a bill we copy the fdsys_billstatus-lastmod.txt
    # file to data-fromfdsys-lastmod.txt, next to data.json. This way we
    # know when the FDSys XML file has changed.

    def get_data_path(*args):
        # Utility function to generate a part of the path
        # to data/{congress}/bills/{billtype}/{billtypenumber}
        # given as many path elements as are provided. args
        # is a list of zero or more of congress, billtype,
        # and billtypenumber (in order).
        args = list(args)
        if len(args) > 0:
            args.insert(1, "bills")
        return os.path.join(utils.data_dir(), *args)

    if not options.get('congress'):
        # Get a list of all congress directories on disk.
        # Filter out non-integer directory names, then sort on the
        # integer.
        def filter_ints(seq):
            for s in seq:
                try:
                    yield int(s)
                except:
                    # Not an integer.
                    continue
        congresses = sorted(filter_ints(os.listdir(get_data_path())))
    else:
        congresses = sorted([int(c) for c in options['congress'].split(',')])

    # walk through congresses
    for congress in congresses:
        # turn this back into a string
        congress = str(congress)

        # walk through all bill types in that congress
        # (sort by bill type so that we proceed in a stable order each run)

        bill_types = [bill_type for bill_type in os.listdir(get_data_path(congress)) if not bill_type.startswith(".")]

        for bill_type in sorted(bill_types):

            # walk through each bill in that congress and bill type
            # (sort by bill number so that we proceed in a normal order)

            bills = [bill for bill in os.listdir(get_data_path(congress, bill_type)) if not bill.startswith(".")]
            for bill_type_and_number in sorted(
                bills,
                key = lambda x : int(x.replace(bill_type, ""))
                ):

                fn = get_data_path(congress, bill_type, bill_type_and_number, fdsys.FDSYS_BILLSTATUS_FILENAME)
                if os.path.exists(fn):
                    # The FDSys bulk data file exists. Does our JSON data
                    # file need to be updated?
                    bulkfile_lastmod = utils.read(fn.replace(".xml", "-lastmod.txt"))
                    parse_lastmod = utils.read(get_data_path(congress, bill_type, bill_type_and_number, "data-fromfdsys-lastmod.txt"))
                    if bulkfile_lastmod != parse_lastmod:
                        bill_id = bill_type_and_number + "-" + congress
                        yield bill_id
Example #58
File: scrub.py Project: atokop/compling
#!/usr/bin/python
import sys
import os
direc = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, "scripts/")
sys.path.append(direc)
import u4
import utils
import os
import random
import nltk
 
bayes = utils.read(direc + "dtree.pkl")
feature_words = utils.read(direc + "feat_words.pkl")

if len(sys.argv) < 2:
    print len(sys.argv)
    print "NOT VALID YOU DUNCE!"
else:
    filename = sys.argv[1]
    print filename
    f = open(filename, "r")
    g = open(filename + "clean", "w")
    lines = f.readlines()
    for line in lines:
        features = u4.contains_word_feature_set(line, feature_words)
        # print bayes.classify(features)
        a = "quake" in line or "earthquake" in line
        # if bayes.classify(features):
        if ("quake" in line or "earthquake" in line) and "." in line:
            g.write(line)
            g.write("\n")
Example #59
File: fdsys.py Project: GPHemsley/congress
def mirror_package(year, collection, package_name, lastmod, granule_name, file_types, options):
  # Where should we store the file?
  path = get_output_path(year, collection, package_name, granule_name, options)
  if not path: return # should skip
  
  # Do we need to update this record?
  lastmod_cache_file = path + "/lastmod.txt"
  cache_lastmod = utils.read(lastmod_cache_file)
  force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)
  
  # Try downloading files for each file type.
  targets = get_package_files(package_name, granule_name, path)
  updated_file_types = set()
  for file_type in file_types:
    if file_type not in targets: raise Exception("Invalid file type: %s" % file_type)
    
    # For BILLS, XML was not available until the 108th Congress, though even after that
    # it was spotty until the 111th or so Congress.
    if file_type == "xml" and collection == "BILLS" and int(package_name[6:9]) < 108:
      continue
    
    f_url, f_path = targets[file_type]
    
    if (not force) and os.path.exists(f_path): continue # we already have the current file
    logging.warn("Downloading: " + f_path)
    data = utils.download(f_url, f_path, utils.merge(options, {
      'binary': True, 
      'force': force, 
      'to_cache': False,
      'needs_content': file_type == "text" and f_path.endswith(".html"),
    }))
    updated_file_types.add(file_type)
    
    if not data:
      if file_type in ("pdf", "zip"):
        # expected to be present for all packages
        raise Exception("Failed to download %s" % package_name)
      else:
        # not all packages have all file types, but assume this is OK
        logging.error("file not found: " + f_url)
        continue
    
    if file_type == "text" and f_path.endswith(".html"):
      # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
      # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
      #       html.fromstring does auto-detection.
      with open(f_path[0:-4] + "txt", "w") as f:
        f.write(unwrap_text_in_html(data))

    if file_type == "zip":
      # This is the entire package in a ZIP file. Extract the contents of this file
      # to the appropriate paths.
      with zipfile.ZipFile(f_path) as zf:
        for z2 in zf.namelist():
          if not z2.startswith(package_name + "/"): raise ValueError("Unmatched file name in package ZIP: " + z2)
          z2 = z2[len(package_name)+1:] # strip off leading package name

          if z2 in ("mods.xml", "premis.xml", "dip.xml"):
            # Extract this file to a file of the same name.
            z3 = path + "/" + z2
          elif z2 == "pdf/" + package_name + ".pdf":
            # Extract this file to "document.pdf".
            z3 = path + "/document.pdf"
          elif z2 == "html/" + package_name + ".htm":
            # Extract this file and unwrap text to "document.txt".
            z3 = path + "/document.txt"
          else:
            raise ValueError("Unmatched file name in package ZIP: " + z2)

          with zf.open(package_name + "/" + z2) as zff:
            with open(z3, "w") as output_file:
              data = zff.read()
              if z3 == path + "/document.txt": data = unwrap_text_in_html(data)
              output_file.write(data)
        
  if collection == "BILLS" and "mods" in updated_file_types:
    # When we download bill files, also create the text-versions/data.json file
    # which extracts commonly used components of the MODS XML.
    from bill_versions import write_bill_version_metadata
    write_bill_version_metadata(get_bill_id_for_package(package_name, with_version=True))

  # Write the current last modified date to disk so we know the next time whether
  # we need to fetch the files for this sitemap item.
  if lastmod and not options.get("cached", False):
    utils.write(lastmod, lastmod_cache_file) 
Example #60
import nltk
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from nltk.corpus import stopwords, state_union
from pprint import pprint as pp
from utils import read

mlk = read('mlk.txt')
stop_words = set(stopwords.words('english'))
words = word_tokenize(mlk)
filtered_mlk = []

for w in words:
    if w not in stop_words:
        filtered_mlk.append(w)

# Fancy One Liner
# filtered_mlk = [w in words if w not in stop_words]

# pp(filtered_mlk)

train_text = state_union.raw('2005-GWBush.txt')
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

custom_tokenized = custom_sent_tokenizer.tokenize(mlk)

def process_content():
    for i in custom_tokenized[5:]:
        words = word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged)