Example #1
def listImgs(AP, GTS):
    '''List the images that *have* been used by the AHP.
    In other words, the images that are *not* images of opportunity.'''

    from extract import mapping, extract
    #Select the targets that produce image numbers

    #Everything labelled in mapping as IMG
    relKeys = [key for key in mapping if key[-3:] == 'IMG']
    imgs = set()
    for key in relKeys:
        dscnNum = extract(key, AP, GTS)
        if dscnNum:  #if the image is there
            imgs.add(str(int(dscnNum)).zfill(4))

    #Add the other images in the pan:
    panImg = extract("First Pan IMG", AP, GTS)
    if panImg:
        numInPan = extract('PanNum', AP, GTS)
        panImg = int(panImg)  #convert to integer from float
        numInPan = int(numInPan)
        for number in range(1, numInPan):
            #The first image is already there, so don't add it again.
            imgs.add(str((panImg + number) % 10000).zfill(4))

    #Add the second image of the stereo pair:
    sterImg = extract('Stereo IMG', AP, GTS)
    if sterImg:
        imgs.add(str((int(sterImg) + 1) % 10000).zfill(4))
    return imgs
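
The example above formats every image number as a zero-padded four-digit string and wraps pan/stereo increments at 10000. A minimal standalone sketch of that convention (the helper name is illustrative, not part of the original module):

def format_img_number(raw):
    """Render a raw (possibly float) image number as a wrap-around, zero-padded four-digit string."""
    return str(int(raw) % 10000).zfill(4)

# format_img_number(9998.0) -> '9998'; format_img_number(10002) -> '0002'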
Example #2
def checkupdates():
        logging.info("Reading from file")
        f = open('docid.txt')
        lists = list(map(int, f))
        docid = lists[0]
        f.close()
        logging.info("Reading complete.")
        logging.info("Starting to check for update")
        url = 'http://hib.iiit-bh.ac.in/Hibiscus/Pub/nbDocDet.php?docid={}&client=iiit&iframe=true&nb=Y'.format(docid)
        logging.info("Trying to fetch the url")
        resp = urllib2.urlopen(url)
        respData = resp.read()
        logging.info("Fetching complete.")
        regex='<h1 style="BACKGROUND-COLOR: white; line-height: 2em; margin:0 .5em .2em .5em; padding: 4px 8px 4px 8px; border-radius: 10px;-moz-border-radius: 10px; -webkit-border-radius: 10px; border: 1px solid silver;text-decoration:none; font-size: 2.1em;">(.*?)</h1>'
        pattern = re.compile(regex)
        header = re.findall(pattern, respData)
        logging.info("Got the header")
        if not header:
            logging.info("No new notice found")
        else:
            logging.info("Got a new notice")
            logging.info("Writing to file")
            docid = docid + 1
            f = open('docid.txt', 'w')
            f.write(str(docid))
            f.close()
            logging.info("Writing complete.")
            try:
                logging.info("sending html to extract")
                extract.extract(respData,header[0],url)
            except Exception as e:
                logging.error("Calling extract failed %s",e)      
Example #3
def main(args):
    model = utils.get_models(bert_config=args.bert_config,
                             pred_n_labels=args.pred_n_labels,
                             arg_n_labels=args.arg_n_labels,
                             n_arg_heads=args.n_arg_heads,
                             n_arg_layers=args.n_arg_layers,
                             pos_emb_dim=args.pos_emb_dim,
                             use_lstm=args.use_lstm,
                             device=args.device)
    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = 'cpu'
    model.load_state_dict(
        torch.load(args.model_path, map_location=map_location))
    model.zero_grad()
    model.eval()

    loader = load_data(data_path=args.test_data_path,
                       batch_size=args.batch_size,
                       tokenizer_config=args.bert_config,
                       train=False)
    start = time.time()
    extract(args, model, loader, args.save_path)
    print("TIME: ", time.time() - start)
    test_results = do_eval(args.save_path, args.test_gold_path)
    utils.print_results("TEST RESULT", test_results,
                        ["F1  ", "PREC", "REC ", "AUC "])
Example #4
def parse_sharejs(url, html):
    kind = url.rsplit('/', 2)[1]  # kind is the top-level category, as opposed to tag_list
    html = html.decode('utf-8')  # decode here
    title = extract('<h1>', '</h1>',
                    extract('<div class="post_title">', '</div>', html))
    post_content = extract('<div class="post_content" id="paragraph">',
                           '<div class="hot_tags">', html)
    if not post_content:
        post_content = extract('<div class="post_content" id="paragraph">',
                               '<div class="share">', html)

    post_content = re.sub(r'<span class="title">(.*?)</span>', '',
                          post_content)
    content = html2markdown(post_content)
    try:
        tag_list = extract_all(
            '">', '</a>', extract('<div class="hot_tags">', '</div>', html))
    except AttributeError:
        tag_list = []

    data = {
        'kind': kind,
        'title': title,
        'source_url': url,
        'source': 'www.sharejs.com',
        'content': content,
        'tag_list': tag_list,
        'read_count': 0,
    }
    return data
Example #5
def parse_sharejs(url, html):
    kind = url.rsplit('/', 2)[1]    # kind is the top-level category, as opposed to tag_list
    html = html.decode('utf-8')    # decode here
    title = extract('<h1>', '</h1>',
                    extract('<div class="post_title">', '</div>', html))
    post_content = extract('<div class="post_content" id="paragraph">',
                           '<div class="hot_tags">', html)
    if not post_content:
        post_content = extract('<div class="post_content" id="paragraph">',
                               '<div class="share">', html)

    post_content = re.sub(r'<span class="title">(.*?)</span>', '', post_content)
    content = html2markdown(post_content)
    try:
        tag_list = extract_all('">', '</a>',
                       extract('<div class="hot_tags">', '</div>', html))
    except AttributeError:
        tag_list = []

    data = {
        'kind': kind,
        'title': title,
        'source_url': url,
        'source': 'www.sharejs.com',
        'content': content,
        'tag_list': tag_list,
        'read_count': 0,
    }
    return data
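
Examples #4 and #5 (and several of the spiders further down) assume two small string-slicing helpers, extract(begin, end, text) and extract_all(begin, end, text). A possible implementation, inferred from how they are called rather than copied from the original module:

def extract(begin, end, text):
    """Return the text between the first begin/end pair, or None if the pair is not found."""
    start = text.find(begin)
    if start == -1:
        return None
    start += len(begin)
    stop = text.find(end, start)
    return text[start:stop] if stop != -1 else None

def extract_all(begin, end, text):
    """Return every non-overlapping occurrence of text between begin and end."""
    results, pos = [], 0
    while True:
        start = text.find(begin, pos)
        if start == -1:
            break
        start += len(begin)
        stop = text.find(end, start)
        if stop == -1:
            break
        results.append(text[start:stop])
        pos = stop + len(end)
    return results

# extract('<h1>', '</h1>', '<h1>Title</h1>') -> 'Title'
# extract_all('<li>', '</li>', '<li>a</li><li>b</li>') -> ['a', 'b']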
Example #6
    def worker(self, video):
        video_path = video.split("\\")
        video_name = video_path[-1][:-4]
        image_path = "G:" + os.sep + video_name + os.sep + video_name
        csv_path = "G:\\" + video_name + ".csv"
        print 'thread-%d work on video %s' % (self.number, video_name)
        #extract video
        #input:video
        #output:image_path
        self.logger.info(video_name+'extract')
        extract.extract(video,image_path)

        #remove duplications
        self.logger.info(video_name+'remove')
        try:
            duplicate_list = duplication.getdelSeq(image_path[0:18])
            dirs = os.listdir(image_path[0:18])
            paths = [image_path[0:18] + os.sep + dir for dir in dirs]
            filelist = list(set(paths).difference(set(duplicate_list)))
        except TypeError:
            dirs = os.listdir(image_path[0:18])
            paths = [image_path[0:18] + os.sep + dir for dir in dirs]
            filelist = paths

        #create threads to count mos of each image
        self.logger.info(video_name+'quality')
        image_queue = Queue.Queue()
        map(image_queue.put,filelist)
        self.logger.info('after removing:'+str(image_queue.qsize()))
        for i in range(15):
            t = quality.ThreadCounter(image_queue,csv_path)
            t.setDaemon(True)
            t.start()
        image_queue.join()
Example #7
def process_file(filename):
    try:
        extract(os.path.join(app.config['UPLOAD_FOLDER'], filename + ".zip"),
                os.path.join("data", "delivery", "000"))
        files = utils.get_tif_list()
        fname = files[0]

        scan = PhScan(fname)
        logger.info("Generating phragmites estimate...")
        #print("Generating phragmites estimate...")
        bgrn  = scan.norm
        phrag = phrag_map(bgrn)
        logger.info("Generating the clusters...")
        #print("Generating the clusters...") 
        clust = cluster_ph(scan, n_clusters=5, n_jobs=10, frac=0.05)


        ffile = os.path.join("tmp", fname.split(os.sep)[-1].replace(".TIF", "_proc.TIF"))


        if not os.path.isfile(ffile):
            logger.info("Writing processed maps to GeoTIFF {0}...".format(ffile))
            #print("Writing processed maps to GeoTIFF {0}...".format(ffile))

            write_tif(ffile, scan, phrag, clust)

        # add time to prepare files
        time.sleep(5)
        logger.info("Processing Done")
        #print("done")
        # -- decrease reference counters for arrays
        #del scan, bgrn, phrag, clust
        return render_template("process_done.html", filename=ffile.split(os.sep)[-1])
    except Exception as ex:
        return redirect(url_for('upload_file', error="There is an error in the process_file, please try again"))
Example #8
def listImgs(AP, GTS):
    '''List the images that *have* been used by the AHP.
    In other words, the images that are *not* images of opportunity.'''

    from extract import mapping, extract
    #Select the targets that produce image numbers

    #Everything labelled in mapping as IMG
    relKeys = [key for key in mapping if key[-3:] == 'IMG']
    imgs = set()
    for key in relKeys:
        dscnNum = extract(key, AP, GTS)
        if dscnNum:#if the image is there
            imgs.add(str(int(dscnNum)).zfill(4))

    #Add the other images in the pan:
    panImg = extract("First Pan IMG", AP, GTS)
    if panImg:
        numInPan = extract('PanNum', AP, GTS)
        panImg = int(panImg) #convert to integer from float
        numInPan = int(numInPan)
        for number in range(1, numInPan):
            #The first image is already there, so don't add it again.
            imgs.add(str((panImg + number) % 10000).zfill(4))

    #Add the second image of the stereo pair:
    sterImg = extract('Stereo IMG', AP, GTS)
    if sterImg:
        imgs.add(str((int(sterImg) + 1) % 10000).zfill(4))
    return imgs
Example #9
    def test_extract_function(self, mock_init, mock_cleanup, mock_write_batch,
                              mock_construct_fn, mock_open):

        #
        # set mock_init's return_value to None, since this method is mocking
        # a constructor and constructor is required to return None
        #
        mock_init.return_value = None

        source_type = "postgres"
        credentials = {'dbname': 'somedb', 'user': '******'}
        source_config = {'table': 'sometable', 'key2': 'somevalue'}
        extract_location = "/some/path"
        extract_filename = "a_file"
        extract.extract(source_type, credentials, source_config,
                        extract_location, extract_filename)

        #
        # verify call to open()
        #
        expected_filename_with_path = self.filename_constructed
        mock_open.assert_called_once_with(expected_filename_with_path, "w+")

        #
        # verify call to construct_function()
        #
        mock_construct_fn.assert_called_once_with(extract_location,
                                                  extract_filename)

        #
        # verify calls to write_batch()
        #
        self.assertEqual(2, mock_write_batch.call_count)

        write_batch_calls = [(1, 2), (3, 4)]
        write_batch_call_list = mock_write_batch.call_args_list

        first_call = write_batch_call_list[0]
        first_call_args, first_call_kwargs = first_call
        first_call_args_of_interest = first_call_args[1]

        second_call = write_batch_call_list[1]
        second_call_args, second_call_kwargs = second_call
        second_call_args_of_interest = second_call_args[1]

        self.assertEqual(first_call_args_of_interest, [(1, "aaa", 1000),
                                                       (2, "bbb", 2000)])
        self.assertEqual(second_call_args_of_interest, [(3, "ccc", 3000),
                                                        (4, "ddd", 4000)])

        #
        # verify call to cleanup()
        #
        mock_cleanup.assert_called_once_with()

        #
        # verify class constructor called with expected arguments
        #
        mock_init.assert_called_once_with(credentials, source_config)
Example #10
def extract_spectra(hdu, yc, dy, outfile, ext=1, minsize=5, thresh=3, grow=0, smooth=False, maskzeros=False, 
                    convert=True,  cleanspectra=True, calfile=None, clobber=True, specformat='ascii'):
    """From an image, extract a spectra.   

    """
    data=hdu[ext].data

    #replace the zeros with the average from the frame
    if maskzeros:
       mean, std = iterstat(data[data > 0])
       #rdata = mean + np.random.normal(mean, std, size=data.shape)
       data[data <= 0] = mean  #rdata[data <= 0]

    y1=yc-dy
    y2=yc+dy
    ap_list=extract(hdu, method='normal', section=[(y1,y2)], minsize=minsize, thresh=thresh, convert=convert)
    sy1a=y2
    sy2a=sy1a+2.0*dy
    ska_list=extract(hdu, method='normal', section=[(sy1a,sy2a)], minsize=minsize, thresh=thresh, convert=convert)
    sy2b=y1-dy
    sy1b=sy2b-2.0*dy
    skb_list=extract(hdu, method='normal', section=[(sy1b,sy2b)], minsize=minsize, thresh=thresh, convert=convert)
    print sy1b, sy2b

    sdata = 0.5*(ska_list[0].ldata/(sy2a-sy1a) + skb_list[0].ldata/(sy2b-sy1b))
    #sdata = ska_list[0].ldata/(sy2a-sy1a)
    #sdata = skb_list[0].ldata/(sy2b-sy1b)
    raw = 1.0 * ap_list[0].ldata
    print 'extract:', ap_list[0].ldata[1124]
    ap_list[0].ldata=ap_list[0].ldata-float(y2-y1) * sdata
    print 'sky:', ap_list[0].ldata[1124]
 
    print ap_list[0].wave[10], ap_list[0].ldata[10], ap_list[0].lvar[10]
    flux_spec=Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata, abs(ap_list[0].lvar)**0.5, stype='continuum')
    print flux_spec.wavelength[10], flux_spec.flux[10], flux_spec.var[10]

    if cleanspectra:
       clean_spectra(ap_list[0], grow=grow)
    print 'clean:', ap_list[0].ldata[1124]

    if calfile:
           cal_spectra=st.readspectrum(calfile, error=False, ftype='ascii')
           airmass=hdu[0].header['AIRMASS']
           exptime=hdu[0].header['EXPTIME']
           extfile=os.path.dirname(st.__file__)+"/suth_extinct.dat"
           print extfile
           ext_spectra=st.readspectrum(extfile, error=False, ftype='ascii')

           flux_spec=Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata, abs(ap_list[0].lvar)**0.5, stype='continuum')
           print flux_spec.flux[10], flux_spec.var[10]
           flux_spec=calfunc(flux_spec, cal_spectra, ext_spectra, airmass, exptime, True)
           print flux_spec.flux[10], flux_spec.var[10]
    else:
        flux_spec = Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata, abs(ap_list[0].lvar)**0.5, stype='continuum')
    
    if specformat == 'ascii':
        write_ascii(outfile, flux_spec, clobber=clobber)
    elif specformat == 'lcogt':
        write_lcogt(outfile, flux_spec, hdu, sky=float(y2-y1) * sdata, raw = raw, clobber=clobber)
Example #11
 def test_extract_with_by_key(self):
     self.assertEquals(
         extract.extract(
             'root/section/item2',
             '{"root": {"section": {"item1": "value1", "item2": "value2"}}}'
         ), 'value2')
     self.assertEquals(extract.extract('a/b/c', '{"a":{"b":{"c":"d"}}}'),
                       'd')
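
The test above treats extract.extract as a path lookup into a JSON document. One way such a helper could look, reconstructed from the assertions rather than taken from the module itself:

import json

def extract(path, document):
    """Walk a '/'-separated key path through a JSON string and return the value found there."""
    node = json.loads(document)
    for key in path.split('/'):
        node = node[key]
    return node

# extract('a/b/c', '{"a":{"b":{"c":"d"}}}') -> 'd'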
Example #12
def top(path, f_type, hang, lie):

    f_list = readfile.readfile(path, f_type)

    for i in f_list:
        extract.extract(i, hang, lie)

    return
Example #13
 def testCheckUrl(self):
     '''
     Validates the url check incorporated in the extract function
     '''
     urls_for_validation = ['google.com', 'https://flipkart.com/ayush', 'https://amazon.com']
     with self.assertRaises(NameError) as context:
         for url in urls_for_validation:
             extract(url)
     self.assertEqual(context.exception.message, 'Invalid URL given')
Example #14
def main():
    "run main function on parsed args"

    # get arguments from command line as a dict-like object
    args = parse_command_line()
    pdf_path = input("Please input full pdf path and add .pdf \n")
    # pass the path to the extract function
    if args.run:
        extract(pdf_path)
Example #15
def handler(event, context):
    """
    entry point for Lambda function
    :param event: the Lambda event
    :param context: the Lambda context
    :return: None
    """

    print(f"'event': {event}")
    print(f"'context': {context}")

    # -----------------------------------------------------
    # EXTRACT

    # define ny_dataset
    ny_dataset = classes.Dataset("ny_dataset")
    ny_dataset.headers_all = ["date", "cases", "deaths"]
    ny_dataset.headers_key = ny_dataset.headers_all
    ny_dataset.match_field = "date"
    ny_dataset.source_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv"

    # extract and print ny_dataset
    ny_dataset.df = extract.extract(ny_dataset.source_url)
    print(f"'ny_dataset.df':\n{ny_dataset.df}")

    # define jh_dataset
    jh_dataset = classes.Dataset("jh_dataset")
    jh_dataset.headers_all = [
        "Date", "Country/Region", "Province/State", "Lat", "Long", "Confirmed",
        "Recovered", "Deaths"
    ]
    jh_dataset.headers_key = ["Date", "Country/Region", "Recovered"]
    jh_dataset.match_field = "Date"
    jh_dataset.source_url = \
        "https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv"

    # extract and print jh_dataset
    jh_dataset.df = extract.extract(jh_dataset.source_url,
                                    jh_dataset.headers_key, "Country/Region",
                                    "US")
    print(f"'jh_dataset.df':\n{jh_dataset.df}")

    # -----------------------------------------------------
    # TRANSFORM

    # transform the datasets into CovidStat Instances
    covid_stats = transform.transform(ny_dataset, jh_dataset)

    # print CovidStats
    print(*covid_stats, sep="\n")

    # -----------------------------------------------------
    # LOAD

    # load CovidStat instances into the CovidStats DynamoDB table
    load.load_all(classes.CovidStat, covid_stats)
    load.load_json(covid_stats)
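
In this Lambda handler, extract.extract is called once with just a URL and once with a column subset plus a field/value filter. A rough sketch of an extractor with that shape, using pandas (the signature is inferred from the two calls above, not copied from the project):

import pandas as pd

def extract(source_url, headers_key=None, match_field=None, match_value=None):
    """Read a CSV from source_url; optionally keep rows where match_field == match_value and only the headers_key columns."""
    df = pd.read_csv(source_url)
    if match_field is not None:
        df = df[df[match_field] == match_value]
    if headers_key is not None:
        df = df[headers_key]
    return df.reset_index(drop=True)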
Example #16
 def iqiyi_spider(self, url):
     ''' iQiyi crawler '''
     r = requests.get(url)
     if r.status_code == 200:
         v_id = extract('data-player-tvid="', '"', r.text)
         url_ = 'http://mixer.video.iqiyi.com/jp/mixin/videos/{v_id}'.format(v_id=v_id)
         r = requests.get(url_)
         if r.status_code == 200:
             return extract('"playCount":', ',"', r.text)
Example #17
 def testCheckUrl(self):
     '''
     Validates the url check incorporated in the extract function
     '''
     urls_for_validation = [
         'google.com', 'https://flipkart.com/ayush', 'https://amazon.com'
     ]
     with self.assertRaises(NameError) as context:
         for url in urls_for_validation:
             extract(url)
     self.assertEqual(context.exception.message, 'Invalid URL given')
Example #18
def extract_interface():

    # ex. files/stego/
    stego_image_dir = gvar.directory['stego']
    mlib.check_dir(stego_image_dir)

    stego_image_folders = os.listdir(stego_image_dir)

    # ex. /home/.../pySTC/files/message_embed/R
    message_dir = gvar.directory['message_extract']
    message_dir_channel = {}
    for i in stego_image_folders:

        message_dir_channel[i] = os.path.join(message_dir, i)
        mlib.check_dir(message_dir_channel[i])

    print('In ' + str(stego_image_dir))
    print('Channel list: ' + str(stego_image_folders))
    print('Extract start...\n')

    for i in range(len(stego_image_folders)):

        # ex. files/stego/R
        stego_image_folders[i] = os.path.join(stego_image_dir,
                                              stego_image_folders[i])

        stego_image_filelist = os.listdir(stego_image_folders[i])
        stego_image_filelist.sort()

        data_size = len(stego_image_filelist)
        print(
            str(data_size) + " images to extract in " +
            str(stego_image_folders[i]))
        print("Start extracting...")

        for j in tqdm(range(int(data_size)), file=sys.stdout):

            # ex. files/stego/R\01-source-00002_stego_R.bmp
            stego_image = os.path.join(stego_image_folders[i],
                                       stego_image_filelist[j])

            # ex. 01-source-00002_stego_R
            stego_image_name = os.path.splitext(stego_image_filelist[j])[0]

            channel = stego_image_name.split('_')[-1]

            output_message_file = stego_image_name + '.txt'
            message_file = os.path.join(message_dir_channel[channel],
                                        output_message_file)
            #print(message_file)

            extract(stego_image, message_file, channel)
        print('Done.\n')
Example #19
def scan(filepath, dirpath, log):
    extract(filepath, dirpath)
    extracted_dirpath = dirpath + '/_' + basename(filepath) + '.extracted'
    if not isdir(extracted_dirpath):
        return [(filepath, False)]
    files = listdir(extracted_dirpath)
    binary_files = [
        join(extracted_dirpath, f) for f in files
        if isfile(join(extracted_dirpath, f))
    ]
    log('Found {} embedded files in {}'.format(len(binary_files), filepath))

    return [scan_file(f, log) for f in binary_files]
Example #20
def extractPackage(package, tarballsDir, sourcesDir, patchesDir):
    if not isdir(sourcesDir):
        makedirs(sourcesDir)
    sourceDirName = package.getSourceDirName()
    packageSrcDir = joinpath(sourcesDir, sourceDirName)
    if isdir(packageSrcDir):
        rmtree(packageSrcDir)
    extract(joinpath(tarballsDir, package.getTarballName()), sourcesDir, TopLevelDirRenamer(sourceDirName))
    diffPath = joinpath(patchesDir, sourceDirName + ".diff")
    if isfile(diffPath):
        for diff in Diff.load(diffPath):
            patch(diff, sourcesDir)
            print "Patched:", diff.getPath()
Example #21
def extractPackage(package, tarballsDir, sourcesDir, patchesDir):
    if not isdir(sourcesDir):
        makedirs(sourcesDir)
    sourceDirName = package.getSourceDirName()
    packageSrcDir = joinpath(sourcesDir, sourceDirName)
    if isdir(packageSrcDir):
        rmtree(packageSrcDir)
    extract(joinpath(tarballsDir, package.getTarballName()), sourcesDir,
            TopLevelDirRenamer(sourceDirName))
    diffPath = joinpath(patchesDir, sourceDirName + '.diff')
    if isfile(diffPath):
        for diff in Diff.load(diffPath):
            patch(diff, sourcesDir)
            print 'Patched:', diff.getPath()
Example #22
    def update_datadir(self, datadir):
        logger.info('Entering {}'.format(datadir))

        h_idx_file = items.IndexFile(os.path.join(self.data_root,
                                                  datadir,
                                                  self.h_index_fname))
        h_idx_file_rev = h_idx_file.header.revision

        logger.info('Index revision is {}'.format(h_idx_file_rev))

        db_dir_entry = self.db['datadirs'][datadir]

        # Skip if revision has not changed
        if db_dir_entry['revision'] == h_idx_file_rev:
            logger.info('Revision unchanged, nothing to update')
            return

        cur_sec_idx = db_dir_entry['cur_section']
        for sec in u.full_circle(h_idx_file.sections, cur_sec_idx):

            logger.debug('Entering section {}'.format(sec.idx))

            if sec.idx == cur_sec_idx:
                next_vrec_idx = db_dir_entry['last_vrec'] + 1
            else:
                next_vrec_idx = 0

            next_vrecs = u.islice_from(sec.video_records, next_vrec_idx)
            for i, vrec in enumerate(next_vrecs):
                if vrec.start_dt == datetime.utcfromtimestamp(0):
                    logger.debug(
                        'Skipping extraction of incomplete vrec at {}:{:x}'
                        .format(vrec._h_idx_file.name, vrec._pos)
                    )
                    continue
                try:
                    extract(vrec)
                    db_dir_entry['last_vrec'] = next_vrec_idx + i
                    db_dir_entry['cur_section'] = sec.idx
                    self.db['cur_datadir'] = datadir
                    self.db.save()
                except FileExistsError as e:
                    logger.info(
                        'File {} exists, will not overwrite'
                        .format(e.filename)
                    )

        logger.info('Done processing revision {}'.format(h_idx_file_rev))
        db_dir_entry['revision'] = h_idx_file_rev
        self.db.save()
Example #23
def match(filePath, fileName):
    fn = fileName.split('.')[0]
    '''
    # generate the parse and depend files
    corpusProcess.segment(filePath+fileName, "data/"+fn+"_分词.txt")
    os.system("java -jar nlp.jar " + "data/ " + fn+"_分词.txt")
    os.remove("data/"+fn+"_分词.txt")
    corpusProcess.parse("data/"+fn+"_句法分析.txt", "data/"+fn+"_parse.txt")
    corpusProcess.depend("data/"+fn+"_依存关系.txt", "data/"+fn+"_depend.txt")
    '''
    # read in the sentences, parse, and depend files
    with open(filePath+fileName, 'r', encoding="utf8") as f:
        sentences = f.readlines()
    with open("data/"+fn+"_parse.txt", 'r', encoding="utf8") as pf:
        parseJson = pf.readlines()
    with open("data/"+fn+"_depend.txt", 'r', encoding="utf8") as df:
        dependJson = df.readlines()
    parseCommon, dependCommon = loadCommon("data/"+fn+"_parse.txt", "data/"+fn+"_depend.txt")
    # check whether each sentence matches the patterns
    vecPOS = []
    vecEmo = []
    vecPAD = []
    for i in range(len(sentences)):
        # does it match the keyword + POS-tagging pattern?
        if matchPOS(sentences[i]):
            vecPOS.append(1)
        else:
            vecPOS.append(0)
        # does it match the sentiment-annotation pattern?
        if matchEmo(sentences[i]):
            vecEmo.append(1)
        else:
            vecEmo.append(0)
        # does it match the syntax + dependency pattern?
        count = 0
        parse = json.loads(parseJson[i])
        for key in parse.keys():
            if key in parseCommon:
                count += 1
        depend = json.loads(dependJson[i])
        for key in depend.keys():
            if key in dependCommon:
                count += 1
        if count >= 35:
            vecPAD.append(1)
        else:
            vecPAD.append(0)
    # extract the opinion sentences
    extract.extract(vecPOS, filePath, fileName)
    return vecPOS, vecEmo, vecPAD
Example #24
def main():
    args = parse_args()
    try:
        if args.action == "extract":
            if args.verbose:
                print "Extracting archive"
            extract.extract(infile=args.input, outfile=args.output, verbose=args.verbose)
        elif args.action == "archive":
            if args.verbose:
                print "Creating archive"
            archive.archive(infile=args.input, compression=args.compression, outfile=args.output, verbose=args.verbose)
    except (extract.ExtractException, archive.ArchiveException) as ex:
        print >> sys.stderr, ex.msg
        return ex.code
    return 0
Example #25
def sim2():
    resolution = 0.006
    acc = extract.extract('./B_Accelerometer_data/jog_9/sub_3.csv', "acc")
    localangular = extract.extract('./C_Gyroscope_data/jog_9/sub_3.csv',
                                   "gyro")
    acc = acc[0:min(len(acc), len(localangular))]
    localangular = localangular[0:min(len(acc), len(localangular))]
    print(len(acc))
    print(len(localangular))
    assert (len(acc) == len(localangular))
    steps = len(acc)
    new_sim = Dead_Reckoning.Dead_Reckoning(acc, localangular, resolution,
                                            steps)
    new_sim.simulate()
    new_sim.plot_traj()
Example #26
    def xinlang_spider(self, url):
        ''' Sina forum crawler '''
        r = requests.get(url)
        if r.status_code == 200:
            count = extract('<font color="#ff0000"> ', '</font>', r.text)

            return int(count.replace(',', ''))
Example #27
def last():
    home = get_response(host="opslinux.com",url="/")
    content = extract_all('<article>','</article>',home)
    for item in content:
        title_html = extract('<a href="','</a>',item)
        title = title_html.split('">')
        print "标题: %s \n地址: %s\n" % (title[1],title[0])
Example #28
    def test_line_extract_4(self):
        line = """2015-03-04 03:13:51 125.122.116.68 POST /revue/JCHA/1995/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 125.122.116.68 "" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 3, 13, 51)))
        self.assertEqual(record.proxy_ip, "125.122.116.68")
        self.assertEqual(record.http_method, "POST")

        self.assertEqual(record.user_ip, "125.122.116.68")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "30.2936, 120.1614")

        self.assertEqual(record.url, "/revue/JCHA/1995/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "")
        self.assertEqual(record.browser, "")
        self.assertEqual(record.os, "")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)
Example #29
def allcore(inputarray, indexarray, coresize, kernel_width_total):
    core = []
    for j in indexarray:
        c = ex.extract(j, inputarray, coresize, kernel_width_total)
        core.append(c)
    
    return core
Example #30
def all():
    archives = get_response(host="opslinux.com",url="/archives.html")
    content = extract_all('<article>','</article>',archives)
    for item in content:
        title_html = extract('<a href="','</a>',item)
        title = title_html.split('">')
        print "标题: %s \n地址: %s\n" % (title[1],title[0])
Example #31
    def test_line_extract_3(self):
        line = """2015-03-04 00:29:36 222.33.68.117 GET /revue/JCHA/2015/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 222.33.68.117 "-" "-" 400 460"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp,
                         get_montreal_time(datetime(2015, 3, 4, 0, 29, 36)))
        self.assertEqual(record.proxy_ip, "222.33.68.117")
        self.assertEqual(record.http_method, "GET")

        self.assertEqual(record.user_ip, "222.33.68.117")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "39.9289, 116.3883")

        self.assertEqual(record.url, "/revue/JCHA/2015/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "-")
        self.assertEqual(record.browser, "Other")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 400)

        self.assertEqual(record.age, 0)
Example #32
    def test_line_extract_2(self):
        line = """2015-03-04 02:17:29 100.43.91.4 GET /revue/JCHA/2014/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 100.43.91.4 "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp,
                         get_montreal_time(datetime(2015, 3, 4, 2, 17, 29)))
        self.assertEqual(record.proxy_ip, "100.43.91.4")
        self.assertEqual(record.http_method, "GET")
        self.assertEqual(record.url, "/revue/JCHA/2014/v6/n1/031091ar.pdf")

        self.assertEqual(record.user_ip, "100.43.91.4")
        self.assertEqual(record.country, "US")
        self.assertEqual(record.continent, "NA")
        self.assertEqual(record.timezone, "America/Los_Angeles")
        self.assertEqual(record.geo_coordinates, "37.4135, -122.1312")

        self.assertEqual(record.journal_name, "jcha")
        # self.assertEqual(record.journal_domain, "")

        self.assertEqual(
            record.raw_user_agent,
            "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")
        self.assertEqual(record.browser, "YandexBot")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertTrue(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)

        self.assertEqual(record.age, 1)
Example #33
def reduce_dither_pair(dither_a, dither_b, traces, trace_direction=1, 
                       lamp_image=None):
    '''dither_a and dither_b are two dither positions of the same source,
    already flat-fielded. traces is a list of initial guesses for trace parameters.
    trace_direction is 1 for a horizontal trace and 0 for a vertical trace.'''
    #p_init = composite_model(traces, model_type='gaussian')
    lamps = lamp_image is not None
    pdb.set_trace()
    difference_image = im_subtract(dither_a, dither_b)[1]
    postrace, negtrace = fit_trace(difference_image, traces, 
                              tracedir=trace_direction)
    dither_a = fix_distortion(dither_a, postrace, trace_direction)
    dither_b = fix_distortion(dither_b, negtrace, trace_direction)
    difference_image = im_subtract(dither_a, dither_b)[1]
    all_profiles = fit_trace(difference_image, traces, 
                             tracedir=trace_direction)
    telluric_image = im_minimum(dither_a, dither_b)[1]
    return extract(all_profiles, difference_image, telluric_image, 
                   tracedir=trace_direction, lamps=lamps, lamp=lamp_image)
Example #34
    def get_permanent_wechat_article_url(self, sougou_url):
        """ 从搜狗的临时url获取永久url

        Args:
            sougou_url (str): "http://mp.weixin.qq.com/s?timestamp=1473815432&src=3&ver=1&signature=puOtJfG0mefG5o6Ls-bqDmML9ZjS5S6oDIhdUReNRm6*bIF9yINfCoXvB3btXzPEeUZvV8bdlSRTgKPx5Nsd6ZfzLK4Gv4X6z7te1EEo2azG3llx*rw*fxqXrKnwP2oqTTrNYxaRzM8cARFIbjPHVLpWdZGqNhyxsKoK5ozlXSk="

        Returns:
            msg_link (str): "http://mp.weixin.qq.com/s?__biz=MzI1OTAwNDc1OA==&amp;mid=2652831837&amp;idx=1&amp;sn=3a93c0b6dfeef85e9b85bdac39f47bce&amp;chksm=f1942064c6e3a9728f0bdc4d9bab481b7079c7c1d9ed32397295b45d0b02af839dafcc4b093e#rd";

        """
        time.sleep(random.randint(1, 10))
        curl_str = """
        curl 'http://mp.weixin.qq.com/s?timestamp=1473815432&src=3&ver=1&signature=puOtJfG0mefG5o6Ls-bqDmML9ZjS5S6oDIhdUReNRm6*bIF9yINfCoXvB3btXzPEeUZvV8bdlSRTgKPx5Nsd6ZfzLK4Gv4X6z7te1EEo2azG3llx*rw*fxqXrKnwP2oqTTrNYxaRzM8cARFIbjPHVLpWdZGqNhyxsKoK5ozlXSk=' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Connection: keep-alive' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' --compressed
        """
        _, headers, _ = parse_curl_str(curl_str)
        headers['User-Agent'] = random_ua()
        r = requests.get(sougou_url)
        html = r.text
        try:
            msg_link = xhtml_unescape(extract('msg_link = "', '";', html))
        except Exception:
            self.logger.exception(html)
            msg_link = sougou_url
        self.logger.info('get permanent url: %s', msg_link)
        return msg_link
Example #35
def stylecloud(request: StyleCloudRequest):
    params = request.dict()
    url = params.pop("url", None)
    text = params.pop("text", None)
    background_color = params.pop("background_color", None)
    gradient = params.pop("gradient", None)

    if gradient == Gradient.none:
        gradient = None

    if url is not None:
        result = extract(url)
        pprint.pprint(result)
        text = result["text"]
    elif text is None:
        raise Exception('Must provide either "text" or "url".')

    sc.gen_stylecloud(**params,
                      text=text,
                      gradient=gradient,
                      icon_dir="/tmp/icons",
                      output_name=OUTPUT_NAME,
                      background_color=background_color.as_hex())

    return FileResponse(OUTPUT_NAME, media_type="image/png", headers=headers)
Example #36
    def test_line_extract_2(self):
        line = """2015-03-04 02:17:29 100.43.91.4 GET /revue/JCHA/2014/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 100.43.91.4 "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 2, 17, 29)))
        self.assertEqual(record.proxy_ip, "100.43.91.4")
        self.assertEqual(record.http_method, "GET")
        self.assertEqual(record.url, "/revue/JCHA/2014/v6/n1/031091ar.pdf")

        self.assertEqual(record.user_ip, "100.43.91.4")
        self.assertEqual(record.country, "US")
        self.assertEqual(record.continent, "NA")
        self.assertEqual(record.timezone, "America/Los_Angeles")
        self.assertEqual(record.geo_coordinates, "37.4135, -122.1312")

        self.assertEqual(record.journal_name, "jcha")
        # self.assertEqual(record.journal_domain, "")

        self.assertEqual(record.raw_user_agent, "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")
        self.assertEqual(record.browser, "YandexBot")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertTrue(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)

        self.assertEqual(record.age, 1)
Example #37
    def test_line_extract_3(self):
        line = """2015-03-04 00:29:36 222.33.68.117 GET /revue/JCHA/2015/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 222.33.68.117 "-" "-" 400 460"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 0, 29, 36)))
        self.assertEqual(record.proxy_ip, "222.33.68.117")
        self.assertEqual(record.http_method, "GET")

        self.assertEqual(record.user_ip, "222.33.68.117")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "39.9289, 116.3883")

        self.assertEqual(record.url, "/revue/JCHA/2015/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "-")
        self.assertEqual(record.browser, "Other")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 400)

        self.assertEqual(record.age, 0)
Example #38
 def zhidao_spider(self, url):
     ''' Baidu Zhidao crawler '''
     id = extract('http://zhidao.baidu.com/question/', '.html', url)
     url_ = 'http://zhidao.baidu.com/api/qbpv?q={id}'.format(id=id)
     r = requests.get(url_)
     if r.status_code == 200:
         return r.text
Example #39
def main():
    np.random.seed(12345)

    # read in the first few time series from the TIDIGITS dataset; the return
    # value is a collection of LabeledTimeSeries (see datasets.utils). You
    # will of course need to have the relevant dataset on your machine, as
    # well as update datasets/paths.py to point to it. For TIDIGITS
    # specifically, you will also need to have librosa installed. For the
    # UCR datasets, the whichExamples argument takes this many examples from
    # all 20 datasets
    whichExamples = np.arange(2)
    tsList = datasets.loadDataset(datasets.TIDIGITS,
                                  whichExamples=whichExamples)

    # uncomment any of these to use a different dataset
    # tsList = datasets.loadDataset(datasets.DISHWASHER, whichExamples=whichExamples)
    # tsList = datasets.loadDataset(datasets.MSRC, whichExamples=whichExamples)
    # tsList = datasets.loadDataset(datasets.UCR, whichExamples=[0])

    Lmin, Lmax = 1. / 20, 1. / 10  # fractions of time series length
    for ts in tsList:
        startIdxs, endIdxs, model, featureMat, featureMatBlur = extract(
            ts.data, Lmin, Lmax)
        plotExtractOutput(ts, startIdxs, endIdxs, featureMat, model)

        # you can also call this if you just want to see what the data looks like
        # ts.plot()

        # plt.savefig(ts.name + '.pdf') # use this to save it

        plt.show()
Example #40
def wrangle_reviews(path, userid=None):
    """
    For a review xml file, extracts and loads into database
    """
    userid  = userid or userid_from_path(path)
    session = create_session()

    # Get user object
    user    = User(id=userid)
    user    = session.merge(user)

    with extract(path) as reviews:
        for review in reviews:
            book = Book(**review.get_book_data())
            book = session.merge(book)

            for author in review.get_author_data():
                author = Author(**author)
                author = session.merge(author)

            for data in review.get_book_authors_data():
                book_author = BookAuthor(**data)
                book_author = session.merge(book_author)

            review = review.get_book_reviews_data()
            review.update({'user_id': userid})
            review = Review(**review)
            review = session.merge(review)

    session.commit()
    session.close()
Example #41
def extract_and_report(argv, html=False, matching_only=True):
    """Extract all credit card numbers from a list of plain text files
    and produce a report.
    @see: L{BincodesDB.fetch}
    @type argv: list(str)
    @type html: bool
    @type matching_only: bool
    @param argv: List of filenames, glob wildcards, or the special value "-".
        See: L{extract.listfiles}
    @param html: C{True} for an HTML report, C{False} for a plain text report.
    @param matching_only: C{True} to show only credit cards that match known
        bincodes, C{False} to show all credit cards.
    @rtype: iterator of (str, Table)
    @return: Yields tuples with the filename and the report for that file.
    """
    found = set()
    bincodes = BincodesDB()
    try:
        for filename in listfiles(argv):
            if filename != '-':
                data = open(filename, 'r').read()
            else:
                data = sys.stdin.read()
            table = Table(html, header_row)
            for cc in extract(data):
                if cc not in found:
                    row = list(bincodes.fetch(cc))
                    if not matching_only or row[1] is not None:
                        table.add_row(row)
                    found.add(cc)
            yield (filename, table)
    finally:
        bincodes.close()
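
The report generator above relies on extract(data) yielding candidate card numbers from raw text. A typical extractor of that kind pairs a digit-run regex with a Luhn checksum; here is a self-contained sketch under that assumption (luhn_ok and find_card_numbers are illustrative names, not the module's):

import re

def luhn_ok(number):
    """Standard Luhn check: from the right, double every second digit and sum."""
    digits = [int(d) for d in number][::-1]
    total = sum(digits[0::2])
    for d in digits[1::2]:
        total += d * 2 - 9 if d * 2 > 9 else d * 2
    return total % 10 == 0

def find_card_numbers(text):
    """Yield 13- to 16-digit runs that pass the Luhn check."""
    for match in re.finditer(r'\b\d{13,16}\b', text):
        if luhn_ok(match.group()):
            yield match.group()

# '4111111111111111' (a well-known test number) passes; '1234567890123456' does not.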
Example #42
def rear():
    if request.method == 'POST':
        extraction = extract.extract(request.files['file'], 'rear')
        write('rear', extraction['data'])
        return extraction

    return render_template('rear.html')
Example #43
def update_file(base_dir, uuid, real_path):
    hasher = hashlib.sha1()
    try:
        with open(real_path, "rb") as afile:
            stat = os.fstat(afile.fileno())
            size = stat.st_size
            mtime = stat.st_mtime
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
    except IOError:  # e.g. the file was deleted at exactly the wrong moment
        logging.exception("calculating hash")
        with oscar.context(base_dir, oscar.min_free_blocks) as context:
            delete.delete_by_uuid(context, uuid)

    row = {"_key":uuid, "size":size, "mtime":mtime, "dirty":False}
    hashval = hasher.hexdigest()

    extracted_content = None
    if fulltext_already_exists(base_dir, hashval):
        #logging.debug("Fulltext already exists %s" % hashval)
        row["fulltext"] = hashval
    else:
        try:
            if size <= fulltext_max_file_size:  # extract fulltext only when the file size is within the limit
                extracted_content = extract.extract(real_path)
        except Exception, e:  # many formats are handled, so there is no telling which exception may occur
            log.create_log(base_dir, "extract", u"%s (%s): %s" % (real_path.decode("utf-8"), hashval, e.message.decode("utf-8")))
Example #44
def front():
    if request.method == 'POST':
        extraction = extract.extract(request.files['file'], 'front')
        write('front', extraction['data'])
        return extraction

    return render_template('front.html')
Example #45
def wrangle_reviews(path, userid=None):
    """
    For a review xml file, extracts and loads into database
    """
    userid = userid or userid_from_path(path)
    session = create_session()

    # Get user object
    user = User(id=userid)
    user = session.merge(user)

    with extract(path) as reviews:
        for review in reviews:
            book = Book(**review.get_book_data())
            book = session.merge(book)

            for author in review.get_author_data():
                author = Author(**author)
                author = session.merge(author)

            for data in review.get_book_authors_data():
                book_author = BookAuthor(**data)
                book_author = session.merge(book_author)

            review = review.get_book_reviews_data()
            review.update({'user_id': userid})
            review = Review(**review)
            review = session.merge(review)

    session.commit()
    session.close()
Example #46
def img_to_txt(filename=''):
    if filename == '':  #default image
        img = cv2.imread('./static/ku.jpg')
    else:
        print('not using the default image')
        img = cv2.imread('.' + filename)
        print('.' + filename)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    #show histogram
    # plt.hist(img.ravel(), 256, [0, 256])
    # plt.show()

    retval, img = cv2.threshold(img, 100, 255,
                                cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # cv2.imshow('img',img)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()
    cv2.imwrite('./static/processed_image.jpg', img)
    text = pytesseract.image_to_string(img)

    data = extract(text)
    return data
Example #47
def main():
	np.random.seed(12345)

	# read in the first few time series from the TIDIGITS dataset; the return
	# value is a collection of LabeledTimeSeries (see datasets.utils). You
	# will of course need to have the relevant dataset on your machine, as
	# well as update datasets/paths.py to point to it. For TIDIGITS
	# specifically, you will also need to have librosa installed. For the
	# UCR datasets, the whichExamples argument takes this many examples from
	# all 20 datasets
	whichExamples = np.arange(2)
	tsList = datasets.loadDataset(datasets.TIDIGITS, whichExamples=whichExamples)

	# uncomment any of these to use a different dataset
	# tsList = datasets.loadDataset(datasets.DISHWASHER, whichExamples=whichExamples)
	# tsList = datasets.loadDataset(datasets.MSRC, whichExamples=whichExamples)
	# tsList = datasets.loadDataset(datasets.UCR, whichExamples=[0])

	Lmin, Lmax = 1./20, 1./10 # fractions of time series length
	for ts in tsList:
		startIdxs, endIdxs, model, featureMat, featureMatBlur = extract(
			ts.data, Lmin, Lmax)
		plotExtractOutput(ts, startIdxs, endIdxs, featureMat, model)

		# you can also call this if you just want to see what the data looks like
		# ts.plot()

		# plt.savefig(ts.name + '.pdf') # use this to save it

		plt.show()
Example #48
def generate_features(featureGenerator, directory):
    originalPath = os.path.dirname(sys.argv[0])
    os.getcwd()
    os.chdir(directory)
    print 'generating features for dir:', directory

    programPath = os.path.join(originalPath,"build", featureGenerator)
    featuresDir = os.path.join(directory, featureGenerator.replace(".exe","") + "/")

    if not os.path.exists(featuresDir):
        os.mkdir(featuresDir)
    for i in os.listdir(os.getcwd()):
        if i in gesturesAll:
            gestureDir = os.path.join(featuresDir, i)
            if not os.path.exists(gestureDir):
                os.mkdir(gestureDir)
            print i
            os.chdir(directory + "/" + i)
            #for dataFile in os.listdir(os.getcwd()):
            for dataFile in glob.glob(os.path.join(os.getcwd(),'*.avi')):
                realpath = os.path.realpath(dataFile)
                result = extract.extract(realpath, programPath)
                basename = os.path.splitext(os.path.basename(dataFile))[0] + ".txt"
                outfile = os.path.join(gestureDir, basename)
                print outfile
                try:
                    os.remove(outfile)
                except OSError:
                    pass
                f = open(outfile, 'w')
                f.write(result)
                f.close()
                
    os.chdir(originalPath)
Example #49
def test_extract():
    """
    :Author: Tim Hoer
    :Date: November 20, 2017
    :Notes: Tests that function loads all images from input directory and
    stores them as instances of the lesion class.
    """
    import os
    import urllib.request
    import shutil
    import tempfile
    from extract import extract
    from Image import Image
    # create temporary directory
    test_dir = tempfile.mkdtemp()
    #test extract on empty directory
    #assertRaises(Exception,extract,test_dir)
    # upload images to temporary directory
    fullfilename = os.path.join(test_dir, 'puppy.jpg')
    urllib.request.urlretrieve(
        "http://www.zarias.com/wp-content/uploads/2015/12/61-cute-puppies.jpg",
        fullfilename)
    fullfilename = os.path.join(test_dir, 'kitten.jpg')
    urllib.request.urlretrieve(
        "http://weknowyourdreams.com/images/kittens/kittens-02.jpg",
        fullfilename)
    # call function
    out = extract(test_dir)
    # check that output array is instance of lesion class
    assert (len(out) == 2)
    assert (isinstance(out[0], Image) is True)
    # remove temporary directory
    shutil.rmtree(test_dir)
Example #50
    def test_line_extract_4(self):
        line = """2015-03-04 03:13:51 125.122.116.68 POST /revue/JCHA/1995/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 125.122.116.68 "" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp,
                         get_montreal_time(datetime(2015, 3, 4, 3, 13, 51)))
        self.assertEqual(record.proxy_ip, "125.122.116.68")
        self.assertEqual(record.http_method, "POST")

        self.assertEqual(record.user_ip, "125.122.116.68")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "30.2936, 120.1614")

        self.assertEqual(record.url, "/revue/JCHA/1995/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "")
        self.assertEqual(record.browser, "")
        self.assertEqual(record.os, "")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)
Example #51
 def testCheckFlipkart(self):
     '''
     The extract function should return a dict on execution with
     urls having the xpath in the website_csv list
     '''
     item_name = "Brica Pop Open Cling Sun Shade"
     item_name_extracted = str(extract('http://www.flipkart.com/brica-pop-open-cling-sun-shade/p/itme2znucyhn7un2?pid=SUDE2ZNUDUFMJ36M&srno=b_1&offer=DOTDOnAutomotive_Jan21.&ref=75afa3e4-e5b7-425a-92f1-745b4f6b7f99')['name']).replace("\n", "").replace(" ", "")
     self.assertEqual(item_name.replace(" ", ""), item_name_extracted)
Example #52
def test_it_saves_to_the_database():
    sheets = extract.validate(extract.extract('fixture/simple.xlsx'))
    extract.save(sheets)

    data = scraperwiki.sql.select('* from Sheet1')
    row = data[2]
    assert_equals(row['Year'], 2012)
    assert_equals(row['Awesomeness'], 8)
Example #53
def test_it_saves_a_unicode_csv_to_the_database():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    extract.save(sheets)

    data = scraperwiki.sql.select('* from swdata')
    row = data[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
Example #54
def test_it_saves_a_unicode_csv_to_the_database():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    extract.save(sheets)

    data = scraperwiki.sql.select('* from swdata')
    row = data[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
Example #55
def get_all_tag_urls(url='http://www.sharejs.com/codes/'):
    html = requests.get(url).content.decode('utf-8')
    tag_urls = extract_all('<a href="', '"',
                           extract('<div class="tags_cloud">', '</ul>', html))
    base_url = 'http://www.sharejs.com%s'
    tag_urls = [base_url % i for i in tag_urls]
    tag_urls = [i + '?start=0' for i in tag_urls]
    return tag_urls
Example #56
 def testCheckFlipkart(self):
     '''
     The extract function should return a dict on execution with
     urls having the xpath in the website_csv list
     '''
     item_name = "Scullers Men's Checkered Casual Shirt"
     item_name_extracted = str(extract('http://www.flipkart.com/scullers-men-s-checkered-casual-shirt/p/itmduvc4fpgtktkf?pid=SHTDUJF6XSSNB92T&srno=b_1&ref=884be278-844c-4a29-b300-b0c131dfddb0')['name']).replace("\n", "").replace(" ", "")
     self.assertEqual(item_name.replace(" ", ""), item_name_extracted)
Example #57
def get_all_tag_urls(url='http://www.sharejs.com/codes/'):
    html = requests.get(url).content.decode('utf-8')
    tag_urls = extract_all('<a href="', '"',
                           extract('<div class="tags_cloud">', '</ul>', html))
    base_url = 'http://www.sharejs.com%s'
    tag_urls = [base_url % i for i in tag_urls]
    tag_urls = [i + '?start=0' for i in tag_urls]
    return tag_urls
Example #58
def test_it_can_extract_a_unicode_csv():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    assert_equals(len(sheets), 1)

    sheet = sheets['swdata']
    assert_equals(len(sheet), 653)
    row = sheet[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
Example #59
 def handle_html(self, url, html):
     html = html.decode('utf-8')
     url_list = extract_all('<a href="', '"',
                            extract('<div class="code_list">', '</ul>', html))
     article_list = [i for i in url_list if 'author' not in i]
     base_url = 'http://www.sharejs.com'
     article_list = [base_url+i for i in article_list]
     article_list.pop(0)
     self.results.extend(article_list)
Example #60
def textChanger(pdfText, mostAuthor="", mostPaper="", extractOptions=["nltk", 5, 5, 5], devMode=False):
    """Takes the semi-cleaned text of a pdf and extracts the desired portions. Output is markdown suitable for display on the website."""
    pdfText = pre_clean.pre_clean(pdfText)
    if mostAuthor:
        mostAuthor = evaluator(authorCounter(pdfText))
    if mostPaper:
        mostPaper = evaluator(paperCounter(pdfText))
    ex = extract(pdfText,extractOptions)
    return ex