Example #1
def listImgs(AP, GTS):
    '''List the images that *have* been used by the AHP.
    In other words, the images that are *not* images of opportunity.'''

    from extract import mapping, extract
    #Select the targets that produce image numbers

    #Everything labelled in mapping as IMG
    relKeys = [key for key in mapping if key[-3:] == 'IMG']
    imgs = set()
    for key in relKeys:
        dscnNum = extract(key, AP, GTS)
        if dscnNum:  #if the image is there
            imgs.add(str(int(dscnNum)).zfill(4))

    #Add the other images in the pan:
    panImg = extract("First Pan IMG", AP, GTS)
    if panImg:
        numInPan = extract('PanNum', AP, GTS)
        panImg = int(panImg)  #convert to integer from float
        numInPan = int(numInPan)
        for number in range(1, numInPan):
            #The first image is already there, so don't add it again.
            imgs.add(str((panImg + number) % 10000).zfill(4))

    #Add the second image of the stereo pair:
    sterImg = extract('Stereo IMG', AP, GTS)
    if sterImg:
        imgs.add(str((int(sterImg) + 1) % 10000).zfill(4))
    return imgs
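
The example above formats every image number as a zero-padded four-digit string and wraps pan/stereo increments at 10000. A minimal standalone sketch of that convention (the helper name is illustrative, not part of the original module):

def format_img_number(raw):
    """Render a raw (possibly float) image number as a wrap-around, zero-padded four-digit string."""
    return str(int(raw) % 10000).zfill(4)

# format_img_number(9998.0) -> '9998'; format_img_number(10002) -> '0002'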
Example #2
def checkupdates():
        logging.info("Reading from file")
        f = open('docid.txt')
        lists = list(map(int, f))
        docid = lists[0]
        f.close()
        logging.info("Reading complete.")
        logging.info("Starting to check for update")
        url = 'http://hib.iiit-bh.ac.in/Hibiscus/Pub/nbDocDet.php?docid={}&client=iiit&iframe=true&nb=Y'.format(docid)
        logging.info("Trying to fetch the url")
        resp = urllib2.urlopen(url)
        respData = resp.read()
        logging.info("Fetching complete.")
        regex='<h1 style="BACKGROUND-COLOR: white; line-height: 2em; margin:0 .5em .2em .5em; padding: 4px 8px 4px 8px; border-radius: 10px;-moz-border-radius: 10px; -webkit-border-radius: 10px; border: 1px solid silver;text-decoration:none; font-size: 2.1em;">(.*?)</h1>'
        pattern = re.compile(regex)
        header = re.findall(pattern, respData)
        logging.info("Got the header")
        if not header:
            logging.info("No new notice found")
        else:
            logging.info("Got a new notice")
            logging.info("Writing to file")
            docid = docid + 1
            f = open('docid.txt', 'w')
            f.write(str(docid))
            f.close()
            logging.info("Writing complete.")
            try:
                logging.info("sending html to extract")
                extract.extract(respData,header[0],url)
            except Exception as e:
                logging.error("Calling extract failed %s",e)      
Example #3
def main(args):
    model = utils.get_models(bert_config=args.bert_config,
                             pred_n_labels=args.pred_n_labels,
                             arg_n_labels=args.arg_n_labels,
                             n_arg_heads=args.n_arg_heads,
                             n_arg_layers=args.n_arg_layers,
                             pos_emb_dim=args.pos_emb_dim,
                             use_lstm=args.use_lstm,
                             device=args.device)
    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = 'cpu'
    model.load_state_dict(
        torch.load(args.model_path, map_location=map_location))
    model.zero_grad()
    model.eval()

    loader = load_data(data_path=args.test_data_path,
                       batch_size=args.batch_size,
                       tokenizer_config=args.bert_config,
                       train=False)
    start = time.time()
    extract(args, model, loader, args.save_path)
    print("TIME: ", time.time() - start)
    test_results = do_eval(args.save_path, args.test_gold_path)
    utils.print_results("TEST RESULT", test_results,
                        ["F1  ", "PREC", "REC ", "AUC "])
Example #4
def parse_sharejs(url, html):
    kind = url.rsplit('/', 2)[1]  # kind is the top-level category, as opposed to tag_list
    html = html.decode('utf-8')  # decode here
    title = extract('<h1>', '</h1>',
                    extract('<div class="post_title">', '</div>', html))
    post_content = extract('<div class="post_content" id="paragraph">',
                           '<div class="hot_tags">', html)
    if not post_content:
        post_content = extract('<div class="post_content" id="paragraph">',
                               '<div class="share">', html)

    post_content = re.sub(r'<span class="title">(.*?)</span>', '',
                          post_content)
    content = html2markdown(post_content)
    try:
        tag_list = extract_all(
            '">', '</a>', extract('<div class="hot_tags">', '</div>', html))
    except AttributeError:
        tag_list = []

    data = {
        'kind': kind,
        'title': title,
        'source_url': url,
        'source': 'www.sharejs.com',
        'content': content,
        'tag_list': tag_list,
        'read_count': 0,
    }
    return data
Example #5
def parse_sharejs(url, html):
    kind = url.rsplit('/', 2)[1]    # kind is the top-level category, as opposed to tag_list
    html = html.decode('utf-8')    # decode here
    title = extract('<h1>', '</h1>',
                    extract('<div class="post_title">', '</div>', html))
    post_content = extract('<div class="post_content" id="paragraph">',
                           '<div class="hot_tags">', html)
    if not post_content:
        post_content = extract('<div class="post_content" id="paragraph">',
                               '<div class="share">', html)

    post_content = re.sub(r'<span class="title">(.*?)</span>', '', post_content)
    content = html2markdown(post_content)
    try:
        tag_list = extract_all('">', '</a>',
                       extract('<div class="hot_tags">', '</div>', html))
    except AttributeError:
        tag_list = []

    data = {
        'kind': kind,
        'title': title,
        'source_url': url,
        'source': 'www.sharejs.com',
        'content': content,
        'tag_list': tag_list,
        'read_count': 0,
    }
    return data
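
Examples #4 and #5 (and several of the spiders further down) assume two small string-slicing helpers, extract(begin, end, text) and extract_all(begin, end, text). A possible implementation, inferred from how they are called rather than copied from the original module:

def extract(begin, end, text):
    """Return the text between the first begin/end pair, or None if the pair is not found."""
    start = text.find(begin)
    if start == -1:
        return None
    start += len(begin)
    stop = text.find(end, start)
    return text[start:stop] if stop != -1 else None

def extract_all(begin, end, text):
    """Return every non-overlapping occurrence of text between begin and end."""
    results, pos = [], 0
    while True:
        start = text.find(begin, pos)
        if start == -1:
            break
        start += len(begin)
        stop = text.find(end, start)
        if stop == -1:
            break
        results.append(text[start:stop])
        pos = stop + len(end)
    return results

# extract('<h1>', '</h1>', '<h1>Title</h1>') -> 'Title'
# extract_all('<li>', '</li>', '<li>a</li><li>b</li>') -> ['a', 'b']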
Example #6
    def worker(self, video):
        video_path = video.split("\\")
        video_name = video_path[-1][:-4]
        image_path = "G:" + os.sep + video_name + os.sep + video_name
        csv_path = "G:\\" + video_name + ".csv"
        print 'thread-%d work on video %s' % (self.number, video_name)
        #extract video
        #input:video
        #output:image_path
        self.logger.info(video_name+'extract')
        extract.extract(video,image_path)

        #remove duplications
        self.logger.info(video_name+'remove')
        try:
            duplicate_list = duplication.getdelSeq(image_path[0:18])
            dirs = os.listdir(image_path[0:18])
            paths = [image_path[0:18] + os.sep + dir for dir in dirs]
            filelist = list(set(paths).difference(set(duplicate_list)))
        except TypeError:
            dirs = os.listdir(image_path[0:18])
            paths = [image_path[0:18] + os.sep + dir for dir in dirs]
            filelist = paths

        #create threads to count mos of each image
        self.logger.info(video_name+'quality')
        image_queue = Queue.Queue()
        map(image_queue.put,filelist)
        self.logger.info('after removing:'+str(image_queue.qsize()))
        for i in range(15):
            t = quality.ThreadCounter(image_queue,csv_path)
            t.setDaemon(True)
            t.start()
        image_queue.join()
Example #7
def process_file(filename):
    try:
        extract(os.path.join(app.config['UPLOAD_FOLDER'], filename + ".zip"),
                os.path.join("data", "delivery", "000"))
        files = utils.get_tif_list()
        fname = files[0]

        scan = PhScan(fname)
        logger.info("Generating phragmites estimate...")
        #print("Generating phragmites estimate...")
        bgrn  = scan.norm
        phrag = phrag_map(bgrn)
        logger.info("Generating the clusters...")
        #print("Generating the clusters...") 
        clust = cluster_ph(scan, n_clusters=5, n_jobs=10, frac=0.05)


        ffile = os.path.join("tmp", fname.split(os.sep)[-1].replace(".TIF", "_proc.TIF"))


        if not os.path.isfile(ffile):
            logger.info("Writing processed maps to GeoTIFF {0}...".format(ffile))
            #print("Writing processed maps to GeoTIFF {0}...".format(ffile))

            write_tif(ffile, scan, phrag, clust)

        # add time to prepare files
        time.sleep(5)
        logger.info("Processing Done")
        #print("done")
        # -- decrease reference counters for arrays
        #del scan, bgrn, phrag, clust
        return render_template("process_done.html", filename=ffile.split(os.sep)[-1])
    except Exception as ex:
        return redirect(url_for('upload_file', error="There is an error in the process_file, please try again"))
Example #8
def listImgs(AP, GTS):
    '''List the images that *have* been used by the AHP.
    In other words, the images that are *not* images of opportunity.'''

    from extract import mapping, extract
    #Select the targets that produce image numbers

    #Everything labelled in mapping as IMG
    relKeys = [key for key in mapping if key[-3:] == 'IMG']
    imgs = set()
    for key in relKeys:
        dscnNum = extract(key, AP, GTS)
        if dscnNum:#if the image is there
            imgs.add(str(int(dscnNum)).zfill(4))

    #Add the other images in the pan:
    panImg = extract("First Pan IMG", AP, GTS)
    if panImg:
        numInPan = extract('PanNum', AP, GTS)
        panImg = int(panImg) #convert to integer from float
        numInPan = int(numInPan)
        for number in range(1, numInPan):
            #The first image is already there, so don't add it again.
            imgs.add(str((panImg + number) % 10000).zfill(4))

    #Add the second image of the stereo pair:
    sterImg = extract('Stereo IMG', AP, GTS)
    if sterImg:
        imgs.add(str((int(sterImg) + 1) % 10000).zfill(4))
    return imgs
Example #9
    def test_extract_function(self, mock_init, mock_cleanup, mock_write_batch,
                              mock_construct_fn, mock_open):

        #
        # set mock_init's return_value to None, since this method is mocking
        # a constructor and constructor is required to return None
        #
        mock_init.return_value = None

        source_type = "postgres"
        credentials = {'dbname': 'somedb', 'user': '******'}
        source_config = {'table': 'sometable', 'key2': 'somevalue'}
        extract_location = "/some/path"
        extract_filename = "a_file"
        extract.extract(source_type, credentials, source_config,
                        extract_location, extract_filename)

        #
        # verify call to open()
        #
        expected_filename_with_path = self.filename_constructed
        mock_open.assert_called_once_with(expected_filename_with_path, "w+")

        #
        # verify call to construct_function()
        #
        mock_construct_fn.assert_called_once_with(extract_location,
                                                  extract_filename)

        #
        # verify calls to write_batch()
        #
        self.assertEqual(2, mock_write_batch.call_count)

        write_batch_calls = [(1, 2), (3, 4)]
        write_batch_call_list = mock_write_batch.call_args_list

        first_call = write_batch_call_list[0]
        first_call_args, first_call_kwargs = first_call
        first_call_args_of_interest = first_call_args[1]

        second_call = write_batch_call_list[1]
        second_call_args, second_call_kwargs = second_call
        second_call_args_of_interest = second_call_args[1]

        self.assertEqual(first_call_args_of_interest, [(1, "aaa", 1000),
                                                       (2, "bbb", 2000)])
        self.assertEqual(second_call_args_of_interest, [(3, "ccc", 3000),
                                                        (4, "ddd", 4000)])

        #
        # verify call to cleanup()
        #
        mock_cleanup.assert_called_once_with()

        #
        # verify class constructor called with expected arguments
        #
        mock_init.assert_called_once_with(credentials, source_config)
Example #10
def extract_spectra(hdu, yc, dy, outfile, ext=1, minsize=5, thresh=3, grow=0, smooth=False, maskzeros=False, 
                    convert=True,  cleanspectra=True, calfile=None, clobber=True, specformat='ascii'):
    """From an image, extract a spectra.   

    """
    data=hdu[ext].data

    #replace the zeros with the average from the frame
    if maskzeros:
       mean, std = iterstat(data[data > 0])
       #rdata = mean + np.random.normal(mean, std, size=data.shape)
       data[data <= 0] = mean  #rdata[data <= 0]

    y1=yc-dy
    y2=yc+dy
    ap_list=extract(hdu, method='normal', section=[(y1,y2)], minsize=minsize, thresh=thresh, convert=convert)
    sy1a=y2
    sy2a=sy1a+2.0*dy
    ska_list=extract(hdu, method='normal', section=[(sy1a,sy2a)], minsize=minsize, thresh=thresh, convert=convert)
    sy2b=y1-dy
    sy1b=sy2b-2.0*dy
    skb_list=extract(hdu, method='normal', section=[(sy1b,sy2b)], minsize=minsize, thresh=thresh, convert=convert)
    print sy1b, sy2b

    sdata = 0.5*(ska_list[0].ldata/(sy2a-sy1a) + skb_list[0].ldata/(sy2b-sy1b))
    #sdata = ska_list[0].ldata/(sy2a-sy1a)
    #sdata = skb_list[0].ldata/(sy2b-sy1b)
    raw = 1.0 * ap_list[0].ldata
    print 'extract:', ap_list[0].ldata[1124]
    ap_list[0].ldata=ap_list[0].ldata-float(y2-y1) * sdata
    print 'sky:', ap_list[0].ldata[1124]
 
    print ap_list[0].wave[10], ap_list[0].ldata[10], ap_list[0].lvar[10]
    flux_spec=Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata, abs(ap_list[0].lvar)**0.5, stype='continuum')
    print flux_spec.wavelength[10], flux_spec.flux[10], flux_spec.var[10]

    if cleanspectra:
       clean_spectra(ap_list[0], grow=grow)
    print 'clean:', ap_list[0].ldata[1124]

    if calfile:
           cal_spectra=st.readspectrum(calfile, error=False, ftype='ascii')
           airmass=hdu[0].header['AIRMASS']
           exptime=hdu[0].header['EXPTIME']
           extfile=os.path.dirname(st.__file__)+"/suth_extinct.dat"
           print extfile
           ext_spectra=st.readspectrum(extfile, error=False, ftype='ascii')

           flux_spec=Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata, abs(ap_list[0].lvar)**0.5, stype='continuum')
           print flux_spec.flux[10], flux_spec.var[10]
           flux_spec=calfunc(flux_spec, cal_spectra, ext_spectra, airmass, exptime, True)
           print flux_spec.flux[10], flux_spec.var[10]
    else:
        flux_spec = Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata, abs(ap_list[0].lvar)**0.5, stype='continuum')
    
    if specformat == 'ascii':
        write_ascii(outfile, flux_spec, clobber=clobber)
    elif specformat == 'lcogt':
        write_lcogt(outfile, flux_spec, hdu, sky=float(y2-y1) * sdata, raw = raw, clobber=clobber)
Example #11
 def test_extract_with_by_key(self):
     self.assertEquals(
         extract.extract(
             'root/section/item2',
             '{"root": {"section": {"item1": "value1", "item2": "value2"}}}'
         ), 'value2')
     self.assertEquals(extract.extract('a/b/c', '{"a":{"b":{"c":"d"}}}'),
                       'd')
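
The test above treats extract.extract as a path lookup into a JSON document. One way such a helper could look, reconstructed from the assertions rather than taken from the module itself:

import json

def extract(path, document):
    """Walk a '/'-separated key path through a JSON string and return the value found there."""
    node = json.loads(document)
    for key in path.split('/'):
        node = node[key]
    return node

# extract('a/b/c', '{"a":{"b":{"c":"d"}}}') -> 'd'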
Example #12
def top(path, f_type, hang, lie):

    f_list = readfile.readfile(path, f_type)

    for i in f_list:
        extract.extract(i, hang, lie)

    return
Example #13
 def testCheckUrl(self):
     '''
     Validates the url check incorporated in the extract function
     '''
     urls_for_validation = ['google.com', 'https://flipkart.com/ayush', 'https://amazon.com']
     with self.assertRaises(NameError) as context:
         for url in urls_for_validation:
             extract(url)
     self.assertEqual(context.exception.message, 'Invalid URL given')
Example #14
def main():
    "run main function on parsed args"

    # get arguments from command line as a dict-like object
    args = parse_command_line()
    pdf_path = input("Please input full pdf path and add .pdf \n")
    # pass the path to the extract function
    if args.run:
        extract(pdf_path)
Example #15
def handler(event, context):
    """
    entry point for Lambda function
    :param event: the Lambda event
    :param context: the Lambda context
    :return: None
    """

    print(f"'event': {event}")
    print(f"'context': {context}")

    # -----------------------------------------------------
    # EXTRACT

    # define ny_dataset
    ny_dataset = classes.Dataset("ny_dataset")
    ny_dataset.headers_all = ["date", "cases", "deaths"]
    ny_dataset.headers_key = ny_dataset.headers_all
    ny_dataset.match_field = "date"
    ny_dataset.source_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv"

    # extract and print ny_dataset
    ny_dataset.df = extract.extract(ny_dataset.source_url)
    print(f"'ny_dataset.df':\n{ny_dataset.df}")

    # define jh_dataset
    jh_dataset = classes.Dataset("jh_dataset")
    jh_dataset.headers_all = [
        "Date", "Country/Region", "Province/State", "Lat", "Long", "Confirmed",
        "Recovered", "Deaths"
    ]
    jh_dataset.headers_key = ["Date", "Country/Region", "Recovered"]
    jh_dataset.match_field = "Date"
    jh_dataset.source_url = \
        "https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv"

    # extract and print jh_dataset
    jh_dataset.df = extract.extract(jh_dataset.source_url,
                                    jh_dataset.headers_key, "Country/Region",
                                    "US")
    print(f"'jh_dataset.df':\n{jh_dataset.df}")

    # -----------------------------------------------------
    # TRANSFORM

    # transform the datasets into CovidStat Instances
    covid_stats = transform.transform(ny_dataset, jh_dataset)

    # print CovidStats
    print(*covid_stats, sep="\n")

    # -----------------------------------------------------
    # LOAD

    # load CovidStat instances into the CovidStats DynamoDB table
    load.load_all(classes.CovidStat, covid_stats)
    load.load_json(covid_stats)
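
In this Lambda handler, extract.extract is called once with just a URL and once with a column subset plus a field/value filter. A rough sketch of an extractor with that shape, using pandas (the signature is inferred from the two calls above, not copied from the project):

import pandas as pd

def extract(source_url, headers_key=None, match_field=None, match_value=None):
    """Read a CSV from source_url; optionally keep rows where match_field == match_value and only the headers_key columns."""
    df = pd.read_csv(source_url)
    if match_field is not None:
        df = df[df[match_field] == match_value]
    if headers_key is not None:
        df = df[headers_key]
    return df.reset_index(drop=True)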
Example #16
 def iqiyi_spider(self, url):
     ''' iQiyi crawler '''
     r = requests.get(url)
     if r.status_code == 200:
         v_id = extract('data-player-tvid="', '"', r.text)
         url_ = 'http://mixer.video.iqiyi.com/jp/mixin/videos/{v_id}'.format(v_id=v_id)
         r = requests.get(url_)
         if r.status_code == 200:
             return extract('"playCount":', ',"', r.text)
Example #17
 def testCheckUrl(self):
     '''
     Validates the url check incorporated in the extract function
     '''
     urls_for_validation = [
         'google.com', 'https://flipkart.com/ayush', 'https://amazon.com'
     ]
     with self.assertRaises(NameError) as context:
         for url in urls_for_validation:
             extract(url)
     self.assertEqual(context.exception.message, 'Invalid URL given')
Example #18
def extract_interface():

    # ex. files/stego/
    stego_image_dir = gvar.directory['stego']
    mlib.check_dir(stego_image_dir)

    stego_image_folders = os.listdir(stego_image_dir)

    # ex. /home/.../pySTC/files/message_embed/R
    message_dir = gvar.directory['message_extract']
    message_dir_channel = {}
    for i in stego_image_folders:

        message_dir_channel[i] = os.path.join(message_dir, i)
        mlib.check_dir(message_dir_channel[i])

    print('In ' + str(stego_image_dir))
    print('Channel list: ' + str(stego_image_folders))
    print('Extract start...\n')

    for i in range(len(stego_image_folders)):

        # ex. files/stego/R
        stego_image_folders[i] = os.path.join(stego_image_dir,
                                              stego_image_folders[i])

        stego_image_filelist = os.listdir(stego_image_folders[i])
        stego_image_filelist.sort()

        data_size = len(stego_image_filelist)
        print(
            str(data_size) + " images to extract in " +
            str(stego_image_folders[i]))
        print("Start extracting...")

        for j in tqdm(range(int(data_size)), file=sys.stdout):

            # ex. files/stego/R\01-source-00002_stego_R.bmp
            stego_image = os.path.join(stego_image_folders[i],
                                       stego_image_filelist[j])

            # ex. 01-source-00002_stego_R
            stego_image_name = os.path.splitext(stego_image_filelist[j])[0]

            channel = stego_image_name.split('_')[-1]

            output_message_file = stego_image_name + '.txt'
            message_file = os.path.join(message_dir_channel[channel],
                                        output_message_file)
            #print(message_file)

            extract(stego_image, message_file, channel)
        print('Done.\n')
Example #19
def scan(filepath, dirpath, log):
    extract(filepath, dirpath)
    extracted_dirpath = dirpath + '/_' + basename(filepath) + '.extracted'
    if not isdir(extracted_dirpath):
        return [(filepath, False)]
    files = listdir(extracted_dirpath)
    binary_files = [
        join(extracted_dirpath, f) for f in files
        if isfile(join(extracted_dirpath, f))
    ]
    log('Found {} embedded files in {}'.format(len(binary_files), filepath))

    return [scan_file(f, log) for f in binary_files]
Example #20
def extractPackage(package, tarballsDir, sourcesDir, patchesDir):
    if not isdir(sourcesDir):
        makedirs(sourcesDir)
    sourceDirName = package.getSourceDirName()
    packageSrcDir = joinpath(sourcesDir, sourceDirName)
    if isdir(packageSrcDir):
        rmtree(packageSrcDir)
    extract(joinpath(tarballsDir, package.getTarballName()), sourcesDir, TopLevelDirRenamer(sourceDirName))
    diffPath = joinpath(patchesDir, sourceDirName + ".diff")
    if isfile(diffPath):
        for diff in Diff.load(diffPath):
            patch(diff, sourcesDir)
            print "Patched:", diff.getPath()
Example #21
def extractPackage(package, tarballsDir, sourcesDir, patchesDir):
    if not isdir(sourcesDir):
        makedirs(sourcesDir)
    sourceDirName = package.getSourceDirName()
    packageSrcDir = joinpath(sourcesDir, sourceDirName)
    if isdir(packageSrcDir):
        rmtree(packageSrcDir)
    extract(joinpath(tarballsDir, package.getTarballName()), sourcesDir,
            TopLevelDirRenamer(sourceDirName))
    diffPath = joinpath(patchesDir, sourceDirName + '.diff')
    if isfile(diffPath):
        for diff in Diff.load(diffPath):
            patch(diff, sourcesDir)
            print 'Patched:', diff.getPath()
Example #22
    def update_datadir(self, datadir):
        logger.info('Entering {}'.format(datadir))

        h_idx_file = items.IndexFile(os.path.join(self.data_root,
                                                  datadir,
                                                  self.h_index_fname))
        h_idx_file_rev = h_idx_file.header.revision

        logger.info('Index revision is {}'.format(h_idx_file_rev))

        db_dir_entry = self.db['datadirs'][datadir]

        # Skip if revision has not changed
        if db_dir_entry['revision'] == h_idx_file_rev:
            logger.info('Revision unchanged, nothing to update')
            return

        cur_sec_idx = db_dir_entry['cur_section']
        for sec in u.full_circle(h_idx_file.sections, cur_sec_idx):

            logger.debug('Entering section {}'.format(sec.idx))

            if sec.idx == cur_sec_idx:
                next_vrec_idx = db_dir_entry['last_vrec'] + 1
            else:
                next_vrec_idx = 0

            next_vrecs = u.islice_from(sec.video_records, next_vrec_idx)
            for i, vrec in enumerate(next_vrecs):
                if vrec.start_dt == datetime.utcfromtimestamp(0):
                    logger.debug(
                        'Skipping extraction of incomplete vrec at {}:{:x}'
                        .format(vrec._h_idx_file.name, vrec._pos)
                    )
                    continue
                try:
                    extract(vrec)
                    db_dir_entry['last_vrec'] = next_vrec_idx + i
                    db_dir_entry['cur_section'] = sec.idx
                    self.db['cur_datadir'] = datadir
                    self.db.save()
                except FileExistsError as e:
                    logger.info(
                        'File {} exists, will not overwrite'
                        .format(e.filename)
                    )

        logger.info('Done processing revision {}'.format(h_idx_file_rev))
        db_dir_entry['revision'] = h_idx_file_rev
        self.db.save()
Example #23
def match(filePath, fileName):
    fn = fileName.split('.')[0]
    '''
    # generate the parse and depend files
    corpusProcess.segment(filePath+fileName, "data/"+fn+"_分词.txt")
    os.system("java -jar nlp.jar " + "data/ " + fn+"_分词.txt")
    os.remove("data/"+fn+"_分词.txt")
    corpusProcess.parse("data/"+fn+"_句法分析.txt", "data/"+fn+"_parse.txt")
    corpusProcess.depend("data/"+fn+"_依存关系.txt", "data/"+fn+"_depend.txt")
    '''
    # read in the sentences, parse, and depend files
    with open(filePath+fileName, 'r', encoding="utf8") as f:
        sentences = f.readlines()
    with open("data/"+fn+"_parse.txt", 'r', encoding="utf8") as pf:
        parseJson = pf.readlines()
    with open("data/"+fn+"_depend.txt", 'r', encoding="utf8") as df:
        dependJson = df.readlines()
    parseCommon, dependCommon = loadCommon("data/"+fn+"_parse.txt", "data/"+fn+"_depend.txt")
    # check whether each sentence matches the patterns
    vecPOS = []
    vecEmo = []
    vecPAD = []
    for i in range(len(sentences)):
        # does it match the keyword + POS-tagging pattern?
        if matchPOS(sentences[i]):
            vecPOS.append(1)
        else:
            vecPOS.append(0)
        # does it match the sentiment-annotation pattern?
        if matchEmo(sentences[i]):
            vecEmo.append(1)
        else:
            vecEmo.append(0)
        # does it match the syntax + dependency pattern?
        count = 0
        parse = json.loads(parseJson[i])
        for key in parse.keys():
            if key in parseCommon:
                count += 1
        depend = json.loads(dependJson[i])
        for key in depend.keys():
            if key in dependCommon:
                count += 1
        if count >= 35:
            vecPAD.append(1)
        else:
            vecPAD.append(0)
    # extract the opinion sentences
    extract.extract(vecPOS, filePath, fileName)
    return vecPOS, vecEmo, vecPAD
Example #24
def main():
    args = parse_args()
    try:
        if args.action == "extract":
            if args.verbose:
                print "Extracting archive"
            extract.extract(infile=args.input, outfile=args.output, verbose=args.verbose)
        elif args.action == "archive":
            if args.verbose:
                print "Creating archive"
            archive.archive(infile=args.input, compression=args.compression, outfile=args.output, verbose=args.verbose)
    except (extract.ExtractException, archive.ArchiveException) as ex:
        print >> sys.stderr, ex.msg
        return ex.code
    return 0
Example #25
def sim2():
    resolution = 0.006
    acc = extract.extract('./B_Accelerometer_data/jog_9/sub_3.csv', "acc")
    localangular = extract.extract('./C_Gyroscope_data/jog_9/sub_3.csv',
                                   "gyro")
    acc = acc[0:min(len(acc), len(localangular))]
    localangular = localangular[0:min(len(acc), len(localangular))]
    print(len(acc))
    print(len(localangular))
    assert (len(acc) == len(localangular))
    steps = len(acc)
    new_sim = Dead_Reckoning.Dead_Reckoning(acc, localangular, resolution,
                                            steps)
    new_sim.simulate()
    new_sim.plot_traj()
Example #26
    def xinlang_spider(self, url):
        ''' Sina forum crawler '''
        r = requests.get(url)
        if r.status_code == 200:
            count = extract('<font color="#ff0000"> ', '</font>', r.text)

            return int(count.replace(',', ''))
Example #27
def last():
    home = get_response(host="opslinux.com",url="/")
    content = extract_all('<article>','</article>',home)
    for item in content:
        title_html = extract('<a href="','</a>',item)
        title = title_html.split('">')
        print "标题: %s \n地址: %s\n" % (title[1],title[0])
Example #28
    def test_line_extract_4(self):
        line = """2015-03-04 03:13:51 125.122.116.68 POST /revue/JCHA/1995/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 125.122.116.68 "" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 3, 13, 51)))
        self.assertEqual(record.proxy_ip, "125.122.116.68")
        self.assertEqual(record.http_method, "POST")

        self.assertEqual(record.user_ip, "125.122.116.68")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "30.2936, 120.1614")

        self.assertEqual(record.url, "/revue/JCHA/1995/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "")
        self.assertEqual(record.browser, "")
        self.assertEqual(record.os, "")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)
Example #29
def allcore(inputarray, indexarray, coresize, kernel_width_total):
    core = []
    for j in indexarray:
        c = ex.extract(j, inputarray, coresize, kernel_width_total)
        core.append(c)
    
    return core
Example #30
def all():
    archives = get_response(host="opslinux.com",url="/archives.html")
    content = extract_all('<article>','</article>',archives)
    for item in content:
        title_html = extract('<a href="','</a>',item)
        title = title_html.split('">')
        print "标题: %s \n地址: %s\n" % (title[1],title[0])
Example #31
    def test_line_extract_3(self):
        line = """2015-03-04 00:29:36 222.33.68.117 GET /revue/JCHA/2015/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 222.33.68.117 "-" "-" 400 460"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp,
                         get_montreal_time(datetime(2015, 3, 4, 0, 29, 36)))
        self.assertEqual(record.proxy_ip, "222.33.68.117")
        self.assertEqual(record.http_method, "GET")

        self.assertEqual(record.user_ip, "222.33.68.117")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "39.9289, 116.3883")

        self.assertEqual(record.url, "/revue/JCHA/2015/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "-")
        self.assertEqual(record.browser, "Other")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 400)

        self.assertEqual(record.age, 0)
Example #32
    def test_line_extract_2(self):
        line = """2015-03-04 02:17:29 100.43.91.4 GET /revue/JCHA/2014/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 100.43.91.4 "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp,
                         get_montreal_time(datetime(2015, 3, 4, 2, 17, 29)))
        self.assertEqual(record.proxy_ip, "100.43.91.4")
        self.assertEqual(record.http_method, "GET")
        self.assertEqual(record.url, "/revue/JCHA/2014/v6/n1/031091ar.pdf")

        self.assertEqual(record.user_ip, "100.43.91.4")
        self.assertEqual(record.country, "US")
        self.assertEqual(record.continent, "NA")
        self.assertEqual(record.timezone, "America/Los_Angeles")
        self.assertEqual(record.geo_coordinates, "37.4135, -122.1312")

        self.assertEqual(record.journal_name, "jcha")
        # self.assertEqual(record.journal_domain, "")

        self.assertEqual(
            record.raw_user_agent,
            "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")
        self.assertEqual(record.browser, "YandexBot")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertTrue(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)

        self.assertEqual(record.age, 1)
Example #33
def reduce_dither_pair(dither_a, dither_b, traces, trace_direction=1, 
                       lamp_image=None):
    '''dither_a and dither_b are two dither positions of the same source,
    already flat-fielded. traces is a list of initial guesses for trace parameters.
    trace_direction is 1 for a horizontal trace and 0 for a vertical trace.'''
    #p_init = composite_model(traces, model_type='gaussian')
    lamps = lamp_image is not None
    pdb.set_trace()
    difference_image = im_subtract(dither_a, dither_b)[1]
    postrace, negtrace = fit_trace(difference_image, traces, 
                              tracedir=trace_direction)
    dither_a = fix_distortion(dither_a, postrace, trace_direction)
    dither_b = fix_distortion(dither_b, negtrace, trace_direction)
    difference_image = im_subtract(dither_a, dither_b)[1]
    all_profiles = fit_trace(difference_image, traces, 
                             tracedir=trace_direction)
    telluric_image = im_minimum(dither_a, dither_b)[1]
    return extract(all_profiles, difference_image, telluric_image, 
                   tracedir=trace_direction, lamps=lamps, lamp=lamp_image)
Example #34
    def get_permanent_wechat_article_url(self, sougou_url):
        """ 从搜狗的临时url获取永久url

        Args:
            sougou_url (str): "http://mp.weixin.qq.com/s?timestamp=1473815432&src=3&ver=1&signature=puOtJfG0mefG5o6Ls-bqDmML9ZjS5S6oDIhdUReNRm6*bIF9yINfCoXvB3btXzPEeUZvV8bdlSRTgKPx5Nsd6ZfzLK4Gv4X6z7te1EEo2azG3llx*rw*fxqXrKnwP2oqTTrNYxaRzM8cARFIbjPHVLpWdZGqNhyxsKoK5ozlXSk="

        Returns:
            msg_link (str): "http://mp.weixin.qq.com/s?__biz=MzI1OTAwNDc1OA==&amp;mid=2652831837&amp;idx=1&amp;sn=3a93c0b6dfeef85e9b85bdac39f47bce&amp;chksm=f1942064c6e3a9728f0bdc4d9bab481b7079c7c1d9ed32397295b45d0b02af839dafcc4b093e#rd";

        """
        time.sleep(random.randint(1, 10))
        curl_str = """
        curl 'http://mp.weixin.qq.com/s?timestamp=1473815432&src=3&ver=1&signature=puOtJfG0mefG5o6Ls-bqDmML9ZjS5S6oDIhdUReNRm6*bIF9yINfCoXvB3btXzPEeUZvV8bdlSRTgKPx5Nsd6ZfzLK4Gv4X6z7te1EEo2azG3llx*rw*fxqXrKnwP2oqTTrNYxaRzM8cARFIbjPHVLpWdZGqNhyxsKoK5ozlXSk=' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Connection: keep-alive' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' --compressed
        """
        _, headers, _ = parse_curl_str(curl_str)
        headers['User-Agent'] = random_ua()
        r = requests.get(sougou_url)
        html = r.text
        try:
            msg_link = xhtml_unescape(extract('msg_link = "', '";', html))
        except Exception:
            self.logger.exception(html)
            msg_link = sougou_url
        self.logger.info('get permanent url: %s', msg_link)
        return msg_link
Example #35
def stylecloud(request: StyleCloudRequest):
    params = request.dict()
    url = params.pop("url", None)
    text = params.pop("text", None)
    background_color = params.pop("background_color", None)
    gradient = params.pop("gradient", None)

    if gradient == Gradient.none:
        gradient = None

    if url is not None:
        result = extract(url)
        pprint.pprint(result)
        text = result["text"]
    elif text is None:
        raise Exception('Must provide either "text" or "url".')

    sc.gen_stylecloud(**params,
                      text=text,
                      gradient=gradient,
                      icon_dir="/tmp/icons",
                      output_name=OUTPUT_NAME,
                      background_color=background_color.as_hex())

    return FileResponse(OUTPUT_NAME, media_type="image/png", headers=headers)
Example #36
    def test_line_extract_2(self):
        line = """2015-03-04 02:17:29 100.43.91.4 GET /revue/JCHA/2014/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 100.43.91.4 "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 2, 17, 29)))
        self.assertEqual(record.proxy_ip, "100.43.91.4")
        self.assertEqual(record.http_method, "GET")
        self.assertEqual(record.url, "/revue/JCHA/2014/v6/n1/031091ar.pdf")

        self.assertEqual(record.user_ip, "100.43.91.4")
        self.assertEqual(record.country, "US")
        self.assertEqual(record.continent, "NA")
        self.assertEqual(record.timezone, "America/Los_Angeles")
        self.assertEqual(record.geo_coordinates, "37.4135, -122.1312")

        self.assertEqual(record.journal_name, "jcha")
        # self.assertEqual(record.journal_domain, "")

        self.assertEqual(record.raw_user_agent, "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")
        self.assertEqual(record.browser, "YandexBot")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertTrue(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)

        self.assertEqual(record.age, 1)
Example #37
    def test_line_extract_3(self):
        line = """2015-03-04 00:29:36 222.33.68.117 GET /revue/JCHA/2015/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 222.33.68.117 "-" "-" 400 460"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 0, 29, 36)))
        self.assertEqual(record.proxy_ip, "222.33.68.117")
        self.assertEqual(record.http_method, "GET")

        self.assertEqual(record.user_ip, "222.33.68.117")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "39.9289, 116.3883")

        self.assertEqual(record.url, "/revue/JCHA/2015/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "-")
        self.assertEqual(record.browser, "Other")
        self.assertEqual(record.os, "Other")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 400)

        self.assertEqual(record.age, 0)
Example #38
 def zhidao_spider(self, url):
     ''' Baidu Zhidao crawler '''
     id = extract('http://zhidao.baidu.com/question/', '.html', url)
     url_ = 'http://zhidao.baidu.com/api/qbpv?q={id}'.format(id=id)
     r = requests.get(url_)
     if r.status_code == 200:
         return r.text
Example #39
def main():
    np.random.seed(12345)

    # read in the first few time series from the TIDIGITS dataset; the return
    # value is a collection of LabeledTimeSeries (see datasets.utils). You
    # will of course need to have the relevant dataset on your machine, as
    # well as update datasets/paths.py to point to it. For TIDIGITS
    # specifically, you will also need to have librosa installed. For the
    # UCR datasets, the whichExamples argument takes this many examples from
    # all 20 datasets
    whichExamples = np.arange(2)
    tsList = datasets.loadDataset(datasets.TIDIGITS,
                                  whichExamples=whichExamples)

    # uncomment any of these to use a different dataset
    # tsList = datasets.loadDataset(datasets.DISHWASHER, whichExamples=whichExamples)
    # tsList = datasets.loadDataset(datasets.MSRC, whichExamples=whichExamples)
    # tsList = datasets.loadDataset(datasets.UCR, whichExamples=[0])

    Lmin, Lmax = 1. / 20, 1. / 10  # fractions of time series length
    for ts in tsList:
        startIdxs, endIdxs, model, featureMat, featureMatBlur = extract(
            ts.data, Lmin, Lmax)
        plotExtractOutput(ts, startIdxs, endIdxs, featureMat, model)

        # you can also call this if you just want to see what the data looks like
        # ts.plot()

        # plt.savefig(ts.name + '.pdf') # use this to save it

        plt.show()
Example #40
def wrangle_reviews(path, userid=None):
    """
    For a review xml file, extracts and loads into database
    """
    userid  = userid or userid_from_path(path)
    session = create_session()

    # Get user object
    user    = User(id=userid)
    user    = session.merge(user)

    with extract(path) as reviews:
        for review in reviews:
            book = Book(**review.get_book_data())
            book = session.merge(book)

            for author in review.get_author_data():
                author = Author(**author)
                author = session.merge(author)

            for data in review.get_book_authors_data():
                book_author = BookAuthor(**data)
                book_author = session.merge(book_author)

            review = review.get_book_reviews_data()
            review.update({'user_id': userid})
            review = Review(**review)
            review = session.merge(review)

    session.commit()
    session.close()
Example #41
def extract_and_report(argv, html=False, matching_only=True):
    """Extract all credit card numbers from a list of plain text files
    and produce a report.
    @see: L{BincodesDB.fetch}
    @type argv: list(str)
    @type html: bool
    @type matching_only: bool
    @param argv: List of filenames, glob wildcards, or the special value "-".
        See: L{extract.listfiles}
    @param html: C{True} for an HTML report, C{False} for a plain text report.
    @param matching_only: C{True} to show only credit cards that match known
        bincodes, C{False} to show all credit cards.
    @rtype: iterator of (str, Table)
    @return: Yields tuples with the filename and the report for that file.
    """
    found = set()
    bincodes = BincodesDB()
    try:
        for filename in listfiles(argv):
            if filename != '-':
                data = open(filename, 'r').read()
            else:
                data = sys.stdin.read()
            table = Table(html, header_row)
            for cc in extract(data):
                if cc not in found:
                    row = list(bincodes.fetch(cc))
                    if not matching_only or row[1] is not None:
                        table.add_row(row)
                    found.add(cc)
            yield (filename, table)
    finally:
        bincodes.close()
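
The report generator above relies on extract(data) yielding candidate card numbers from raw text. A typical extractor of that kind pairs a digit-run regex with a Luhn checksum; here is a self-contained sketch under that assumption (luhn_ok and find_card_numbers are illustrative names, not the module's):

import re

def luhn_ok(number):
    """Standard Luhn check: from the right, double every second digit and sum."""
    digits = [int(d) for d in number][::-1]
    total = sum(digits[0::2])
    for d in digits[1::2]:
        total += d * 2 - 9 if d * 2 > 9 else d * 2
    return total % 10 == 0

def find_card_numbers(text):
    """Yield 13- to 16-digit runs that pass the Luhn check."""
    for match in re.finditer(r'\b\d{13,16}\b', text):
        if luhn_ok(match.group()):
            yield match.group()

# '4111111111111111' (a well-known test number) passes; '1234567890123456' does not.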
Example #42
def rear():
    if request.method == 'POST':
        extraction = extract.extract(request.files['file'], 'rear')
        write('rear', extraction['data'])
        return extraction

    return render_template('rear.html')
Example #43
def update_file(base_dir, uuid, real_path):
    hasher = hashlib.sha1()
    try:
        with open(real_path, "rb") as afile:
            stat = os.fstat(afile.fileno())
            size = stat.st_size
            mtime = stat.st_mtime
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
    except IOError:  # e.g. the file was deleted at exactly the wrong moment
        logging.exception("calculating hash")
        with oscar.context(base_dir, oscar.min_free_blocks) as context:
            delete.delete_by_uuid(context, uuid)

    row = {"_key":uuid, "size":size, "mtime":mtime, "dirty":False}
    hashval = hasher.hexdigest()

    extracted_content = None
    if fulltext_already_exists(base_dir, hashval):
        #logging.debug("Fulltext already exists %s" % hashval)
        row["fulltext"] = hashval
    else:
        try:
            if size <= fulltext_max_file_size:  # extract fulltext only when the file size is within the limit
                extracted_content = extract.extract(real_path)
        except Exception, e:  # many formats are handled, so there is no telling which exception may occur
            log.create_log(base_dir, "extract", u"%s (%s): %s" % (real_path.decode("utf-8"), hashval, e.message.decode("utf-8")))
Example #44
def front():
    if request.method == 'POST':
        extraction = extract.extract(request.files['file'], 'front')
        write('front', extraction['data'])
        return extraction

    return render_template('front.html')
Example #45
def wrangle_reviews(path, userid=None):
    """
    For a review xml file, extracts and loads into database
    """
    userid = userid or userid_from_path(path)
    session = create_session()

    # Get user object
    user = User(id=userid)
    user = session.merge(user)

    with extract(path) as reviews:
        for review in reviews:
            book = Book(**review.get_book_data())
            book = session.merge(book)

            for author in review.get_author_data():
                author = Author(**author)
                author = session.merge(author)

            for data in review.get_book_authors_data():
                book_author = BookAuthor(**data)
                book_author = session.merge(book_author)

            review = review.get_book_reviews_data()
            review.update({'user_id': userid})
            review = Review(**review)
            review = session.merge(review)

    session.commit()
    session.close()
Example #46
def img_to_txt(filename=''):
    if filename == '':  #default image
        img = cv2.imread('./static/ku.jpg')
    else:
        print('not using the default image')
        img = cv2.imread('.' + filename)
        print('.' + filename)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    #show histogram
    # plt.hist(img.ravel(), 256, [0, 256])
    # plt.show()

    retval, img = cv2.threshold(img, 100, 255,
                                cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # cv2.imshow('img',img)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()
    cv2.imwrite('./static/processed_image.jpg', img)
    text = pytesseract.image_to_string(img)

    data = extract(text)
    return data
Example #47
def main():
	np.random.seed(12345)

	# read in the first few time series from the TIDIGITS dataset; the return
	# value is a collection of LabeledTimeSeries (see datasets.utils). You
	# will of course need to have the relevant dataset on your machine, as
	# well as update datasets/paths.py to point to it. For TIDIGITS
	# specifically, you will also need to have librosa installed. For the
	# UCR datasets, the whichExamples argument takes this many examples from
	# all 20 datasets
	whichExamples = np.arange(2)
	tsList = datasets.loadDataset(datasets.TIDIGITS, whichExamples=whichExamples)

	# uncomment any of these to use a different dataset
	# tsList = datasets.loadDataset(datasets.DISHWASHER, whichExamples=whichExamples)
	# tsList = datasets.loadDataset(datasets.MSRC, whichExamples=whichExamples)
	# tsList = datasets.loadDataset(datasets.UCR, whichExamples=[0])

	Lmin, Lmax = 1./20, 1./10 # fractions of time series length
	for ts in tsList:
		startIdxs, endIdxs, model, featureMat, featureMatBlur = extract(
			ts.data, Lmin, Lmax)
		plotExtractOutput(ts, startIdxs, endIdxs, featureMat, model)

		# you can also call this if you just want to see what the data looks like
		# ts.plot()

		# plt.savefig(ts.name + '.pdf') # use this to save it

		plt.show()
Example #48
def generate_features(featureGenerator, directory):
    originalPath = os.path.dirname(sys.argv[0])
    os.getcwd()
    os.chdir(directory)
    print 'generating features for dir:', directory

    programPath = os.path.join(originalPath,"build", featureGenerator)
    featuresDir = os.path.join(directory, featureGenerator.replace(".exe","") + "/")

    if not os.path.exists(featuresDir):
        os.mkdir(featuresDir)
    for i in os.listdir(os.getcwd()):
        if i in gesturesAll:
            gestureDir = os.path.join(featuresDir, i)
            if not os.path.exists(gestureDir):
                os.mkdir(gestureDir)
            print i
            os.chdir(directory + "/" + i)
            #for dataFile in os.listdir(os.getcwd()):
            for dataFile in glob.glob(os.path.join(os.getcwd(),'*.avi')):
                realpath = os.path.realpath(dataFile)
                result = extract.extract(realpath, programPath)
                basename = os.path.splitext(os.path.basename(dataFile))[0] + ".txt"
                outfile = os.path.join(gestureDir, basename)
                print outfile
                try:
                    os.remove(outfile)
                except OSError:
                    pass
                f = open(outfile, 'w')
                f.write(result)
                f.close()
                
    os.chdir(originalPath)
Example #49
def test_extract():
    """
    :Author: Tim Hoer
    :Date: November 20, 2017
    :Notes: Tests that function loads all images from input directory and
    stores them as instances of the lesion class.
    """
    import os
    import urllib.request
    import shutil
    import tempfile
    from extract import extract
    from Image import Image
    # create temporary directory
    test_dir = tempfile.mkdtemp()
    #test extract on empty directory
    #assertRaises(Exception,extract,test_dir)
    # upload images to temporary directory
    fullfilename = os.path.join(test_dir, 'puppy.jpg')
    urllib.request.urlretrieve(
        "http://www.zarias.com/wp-content/uploads/2015/12/61-cute-puppies.jpg",
        fullfilename)
    fullfilename = os.path.join(test_dir, 'kitten.jpg')
    urllib.request.urlretrieve(
        "http://weknowyourdreams.com/images/kittens/kittens-02.jpg",
        fullfilename)
    # call function
    out = extract(test_dir)
    # check that output array is instance of lesion class
    assert (len(out) == 2)
    assert (isinstance(out[0], Image) is True)
    # remove temporary directory
    shutil.rmtree(test_dir)
Example #50
    def test_line_extract_4(self):
        line = """2015-03-04 03:13:51 125.122.116.68 POST /revue/JCHA/1995/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 125.122.116.68 "" "-" 200 6387"""
        record = extract(line, JournalReferential([]))
        self.assertIsNotNone(record)

        self.assertEqual(record.timestamp,
                         get_montreal_time(datetime(2015, 3, 4, 3, 13, 51)))
        self.assertEqual(record.proxy_ip, "125.122.116.68")
        self.assertEqual(record.http_method, "POST")

        self.assertEqual(record.user_ip, "125.122.116.68")
        self.assertEqual(record.country, "CN")
        self.assertEqual(record.continent, "AS")
        self.assertEqual(record.timezone, "Asia/Shanghai")
        self.assertEqual(record.geo_coordinates, "30.2936, 120.1614")

        self.assertEqual(record.url, "/revue/JCHA/1995/v6/n1/031091ar.pdf")

        self.assertEqual(record.raw_user_agent, "")
        self.assertEqual(record.browser, "")
        self.assertEqual(record.os, "")
        self.assertEqual(record.device_type, "")
        self.assertFalse(record.is_good_robot)

        self.assertEqual(record.referer, "")
        self.assertEqual(record.http_response_code, 200)
Example #51
 def testCheckFlipkart(self):
     '''
     The extract function should return a dict on execution with
     urls having the xpath in the website_csv list
     '''
     item_name = "Brica Pop Open Cling Sun Shade"
     item_name_extracted = str(extract('http://www.flipkart.com/brica-pop-open-cling-sun-shade/p/itme2znucyhn7un2?pid=SUDE2ZNUDUFMJ36M&srno=b_1&offer=DOTDOnAutomotive_Jan21.&ref=75afa3e4-e5b7-425a-92f1-745b4f6b7f99')['name']).replace("\n", "").replace(" ", "")
     self.assertEqual(item_name.replace(" ", ""), item_name_extracted)
Example #52
def test_it_saves_to_the_database():
    sheets = extract.validate(extract.extract('fixture/simple.xlsx'))
    extract.save(sheets)

    data = scraperwiki.sql.select('* from Sheet1')
    row = data[2]
    assert_equals(row['Year'], 2012)
    assert_equals(row['Awesomeness'], 8)
Example #53
def test_it_saves_a_unicode_csv_to_the_database():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    extract.save(sheets)

    data = scraperwiki.sql.select('* from swdata')
    row = data[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
Example #54
def test_it_saves_a_unicode_csv_to_the_database():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    extract.save(sheets)

    data = scraperwiki.sql.select('* from swdata')
    row = data[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
Example #55
def get_all_tag_urls(url='http://www.sharejs.com/codes/'):
    html = requests.get(url).content.decode('utf-8')
    tag_urls = extract_all('<a href="', '"',
                           extract('<div class="tags_cloud">', '</ul>', html))
    base_url = 'http://www.sharejs.com%s'
    tag_urls = [base_url % i for i in tag_urls]
    tag_urls = [i + '?start=0' for i in tag_urls]
    return tag_urls
Example #56
 def testCheckFlipkart(self):
     '''
     The extract function should return a dict on execution with
     urls having the xpath in the website_csv list
     '''
     item_name = "Scullers Men's Checkered Casual Shirt"
     item_name_extracted = str(extract('http://www.flipkart.com/scullers-men-s-checkered-casual-shirt/p/itmduvc4fpgtktkf?pid=SHTDUJF6XSSNB92T&srno=b_1&ref=884be278-844c-4a29-b300-b0c131dfddb0')['name']).replace("\n", "").replace(" ", "")
     self.assertEqual(item_name.replace(" ", ""), item_name_extracted)
Example #57
def get_all_tag_urls(url='http://www.sharejs.com/codes/'):
    html = requests.get(url).content.decode('utf-8')
    tag_urls = extract_all('<a href="', '"',
                           extract('<div class="tags_cloud">', '</ul>', html))
    base_url = 'http://www.sharejs.com%s'
    tag_urls = [base_url % i for i in tag_urls]
    tag_urls = [i + '?start=0' for i in tag_urls]
    return tag_urls
Example #58
def test_it_can_extract_a_unicode_csv():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    assert_equals(len(sheets), 1)

    sheet = sheets['swdata']
    assert_equals(len(sheet), 653)
    row = sheet[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
Example #59
 def handle_html(self, url, html):
     html = html.decode('utf-8')
     url_list = extract_all('<a href="', '"',
                            extract('<div class="code_list">', '</ul>', html))
     article_list = [i for i in url_list if 'author' not in i]
     base_url = 'http://www.sharejs.com'
     article_list = [base_url+i for i in article_list]
     article_list.pop(0)
     self.results.extend(article_list)
Example #60
def textChanger(pdfText, mostAuthor="", mostPaper="", extractOptions=["nltk", 5, 5, 5], devMode=False):
    """Takes the semi-cleaned text of a pdf and extracts the desired portions. Output is markdown suitable for display on the website."""
    pdfText = pre_clean.pre_clean(pdfText)
    if mostAuthor:
        mostAuthor = evaluator(authorCounter(pdfText))
    if mostPaper:
        mostPaper = evaluator(paperCounter(pdfText))
    ex = extract(pdfText,extractOptions)
    return ex