Example #1
def expand_videos(msid, video):
    gc_data = metadata(msid)  # cached on first hit
    gc_id_str = ", ".join(gc_data.keys())

    v_id = video['id']
    ensure(v_id in gc_data,
           "glencoe doesn't know %r, only %r" % (v_id, gc_id_str))

    video_data = gc_data[v_id]
    video_data = utils.subdict(video_data, ['jpg_href', 'width', 'height'])
    video_data = utils.renkeys(video_data, [('jpg_href', 'image')])

    func = lambda mtype: {
        'mediaType': SOURCES[mtype],
        'uri': gc_data[v_id][mtype + "_href"]
    }
    video_data['sources'] = lmap(func, SOURCES)
    video.update(video_data)

    del video['uri']  # returned by elife-tools, not part of spec

    # Add placeholder, the video thumbnail image
    video["placeholder"] = {}
    video["placeholder"]["uri"] = video["image"].split('/')[-1]
    video["placeholder"]["alt"] = ""

    return video
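
Every example on this page relies on the same kind of `ensure` helper: assert a condition, fail with a message, and optionally raise a specific exception class (as `validate_response` and `_subdesc` do further down). Its definition is not included in these snippets; a minimal sketch consistent with the two- and three-argument calls shown here might be:

def ensure(assertion, msg, exception_class=AssertionError):
    # sketch only -- the real helper behind these examples is not shown here
    if not assertion:
        raise exception_class(msg)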
Example #2
def expand_location(path):
    # if isinstance(path, file):
    if is_file(path):
        path = path.name

    if path.startswith('https://s3-external-1.amazonaws.com/') or path.startswith('https://s3.amazonaws.com/'):
        # it's being downloaded from a bucket, no worries
        return path

    # resolve any symlinks
    # the backfill uses symlinks to the article-xml dir
    path = os.path.abspath(os.path.realpath(path))

    if re.match(r".*article-xml/articles/.+\.xml$", path):
        # this article is coming from the local ./article-xml/ directory, which
        # is almost certainly a git checkout. we want a location that looks like:
        # https://raw.githubusercontent.com/elifesciences/elife-article-xml/5f1179c24c9b8a8b700c5f5bf3543d16a32fbe2f/articles/elife-00003-v1.xml
        rc, rawsha = utils.run_script(["cat", "elife-article-xml.sha1"])
        ensure(rc == 0, "failed to read the contents of './elife-article-xml.sha1'")
        sha = rawsha.strip()
        fname = os.path.basename(path)
        return "https://raw.githubusercontent.com/elifesciences/elife-article-xml/%s/articles/%s" % (sha, fname)

    # who knows what this path is ...
    LOG.warn("scraping article content in a non-repeatable way. path %r not found in article-xml dir. please don't send the results to lax", path)
    return path
Example #3
def expand_location(path):
    # if isinstance(path, file):
    if is_file(path):
        path = path.name

    if path.startswith('https://s3-external-1.amazonaws.com/'
                       ) or path.startswith('https://s3.amazonaws.com/'):
        # it's being downloaded from a bucket, no worries
        return path

    # resolve any symlinks
    # the backfill uses symlinks to the article-xml dir
    path = os.path.abspath(os.path.realpath(path))

    if re.match(r".*article-xml/articles/.+\.xml$", path):
        # this article is coming from the local ./article-xml/ directory, which
        # is almost certainly a git checkout. we want a location that looks like:
        # https://raw.githubusercontent.com/elifesciences/elife-article-xml/5f1179c24c9b8a8b700c5f5bf3543d16a32fbe2f/articles/elife-00003-v1.xml
        rc, rawsha = utils.run_script(["cat", "elife-article-xml.sha1"])
        ensure(rc == 0,
               "failed to read the contents of './elife-article-xml.sha1'")
        sha = rawsha.strip()
        fname = os.path.basename(path)
        return "https://raw.githubusercontent.com/elifesciences/elife-article-xml/%s/articles/%s" % (
            sha, fname)

    # who knows what this path is ...
    LOG.warn(
        "scraping article content in a non-repeatable way. path %r not found in article-xml dir. please don't send the results to lax",
        path)
    return path
Example #4
 def add_connection(self, from_room, to_room):
     ensure(from_room >= 0 and from_room < len(self.nodes),
            'from_room out of bounds')
     ensure(to_room >= 0 and to_room < len(self.nodes),
            'to_room out of bounds')
     self.connections[from_room, to_room] = distance.euclidean(
         self.nodes[from_room].center, self.nodes[to_room].center)
Example #5
def expand_videos(msid, video):
    gc_data = metadata(msid) # cached on first hit
    gc_id_str = ", ".join(gc_data.keys())

    v_id = video['id']
    ensure(gc_data, "glencoe doesn't know %r, it doesn't have any media" % msid)
    ensure(v_id in gc_data, "glencoe doesn't know %r, only %r" % (v_id, gc_id_str))

    video_data = gc_data[v_id]
    video_data = utils.subdict(video_data, ['jpg_href', 'width', 'height'])
    video_data = utils.renkeys(video_data, [('jpg_href', 'image')])

    func = lambda mtype: OrderedDict([
        ('mediaType', SOURCES[mtype]),
        ('uri', gc_data[v_id][mtype + "_href"])
    ])
    video_data['sources'] = lmap(func, SOURCES)
    video.update(video_data)

    del video['uri'] # returned by elife-tools, not part of spec

    # Add placeholder, the video thumbnail image
    video["placeholder"] = OrderedDict()
    video["placeholder"]["uri"] = video["image"].split('/')[-1]
    video["placeholder"]["alt"] = ""

    return video
Example #6
    def triangulate(self):
        ensure(
            len(self.nodes) >= 4,
            'Cannot triangulate with less than four rooms')
        self.reset_connections()
        points = []
        self.lookup = []
        i = 0
        for id, n in enumerate(self.nodes):
            if n.is_main:
                points.append(n.center)
                self.lookup.append(id)
            i += 1

        tri = Delaunay(points)

        for simplex in tri.simplices:
            self.add_connection(self.lookup[simplex[0]],
                                self.lookup[simplex[1]])
            self.add_connection(self.lookup[simplex[1]],
                                self.lookup[simplex[2]])
            self.add_connection(self.lookup[simplex[2]],
                                self.lookup[simplex[0]])

        self.triangulation = tri
Example #7
def file_handler(path):
    ensure(
        path.startswith(PROJECT_DIR),
        "unsafe operation - refusing to read from a file location outside of project root. %r does not start with %r"
        % (path, PROJECT_DIR))
    xml = open(path, 'r').read()
    # write cache?
    return xml
Example #8
 def splitter(string):
     if isinstance(string, list):
         pair = string # already split into pairs, return what we have
         return pair
     ensure('|' in string, "override key and value must be separated by a pipe '|'")
     first, rest = string.split('|', 1)
     ensure(rest.strip(), "a value must be provided. use 'null' without quotes to use an empty value")
     return first, rest
Example #9
def manual_overrides(ctx, data):
    "replace top-level article keys with new values provided in ctx.override"
    overrides = ctx.get('override', {})
    ensure(isinstance(overrides, dict), "given mapping of overrides is not a dictionary")
    # possibly add support for dotted paths in future?
    for key, value in overrides.items():
        data['article'][key] = value
    return data
Example #10
def test_divide():
    l1 = ['/', 2, 2]
    l2 = ['/', 30, 2, ['+', 1, 2]]

    # log(Apply().divide(l1))
    # log(Apply().divide(l2))
    ensure(Apply().divide(l1) == 1, 'divide test 1')
    ensure(Apply().divide(l2) == 5, 'divide test 2')
Example #11
def validate_response(response):
    utils.validate(response, conf.RESPONSE_SCHEMA)
    max_size_in_bytes = 262144
    response_size = len(serialise_response(response))
    ensure(
        response_size <= max_size_in_bytes,
        "response size (%s) is too large: %s" %
        (response_size, max_size_in_bytes), ValidationError)
Example #12
def manual_overrides(ctx, data):
    "replace top-level article keys with new values provided in ctx.override"
    overrides = ctx.get('override', {})
    ensure(isinstance(overrides, dict),
           "given mapping of overrides is not a dictionary")
    # possibly add support for dotted paths in future?
    for key, value in overrides.items():
        data['article'][key] = value
    return data
Example #13
def test_call_function():
    d1 = [['def', 'f1', ['a', 'b'], [['if', ['<', 'a', 0], 3, 'b']]]]
    apply = Apply()
    apply.apply_trees(d1)
    # print('func', self.func)
    l1 = ['call', 'f1', [1, 2]]
    # log('var', apply.var)
    # log('func', apply.func)
    # log('func result', apply.call_func(l1))
    ensure(apply.call_func(l1) == 2, 'call_function test 1')
Example #14
def download(location):
    "download file, convert and pipe content straight into lax + transparent cache"
    ensure('://' in location[:10], 'no protocol found in %r, failing' % location)
    protocol, path = location.split('://')
    downloaderficationer = {
        'https': lambda: http_download(location),
        # load files relative to adaptor root
        'file': partial(file_handler, path)
    }
    file_contents = downloaderficationer[protocol]()
    return file_contents
Example #15
def download(location):
    "download file, convert and pipe content straight into lax + transparent cache"
    ensure('://' in location[:10],
           'no protocol found in %r, failing' % location)
    protocol, path = location.split('://')
    downloaderficationer = {
        'https': lambda: http_download(location),
        # load files relative to adaptor root
        'file': partial(file_handler, path)
    }
    file_contents = downloaderficationer[protocol]()
    return file_contents
Example #16
 def splitter(string):
     if isinstance(string, list):
         pair = string  # already split into pairs, return what we have
         return pair
     ensure('|' in string,
            "override key and value must be seperated by a pipe '|'")
     first, rest = string.split('|', 1)
     ensure(
         rest.strip(),
         "a value must be provided. use 'null' without quotes to use an empty value"
     )
     return first, rest
Example #17
def validate_gc_data(gc_data):
    # we've had one case like this
    ensure(gc_data != {}, "glencoe returned successfully, but response is empty")

    # we also can't guarantee all of the sources will always be present
    known_sources = SOURCES.keys()
    for v_id, v_data in gc_data.items():

        available_sources = lfilter(lambda mtype: mtype + "_href" in v_data, known_sources)

        # fail if we have partial data
        msg = "number of available sources less than known sources for %r. missing: %s" % \
            (v_id, ", ".join(set(known_sources) - set(available_sources)))
        ensure(len(available_sources) == len(known_sources), msg)
Example #18
def test_apply_trees():  # this is the top-level function; it is passed the abstract syntax tree
    l1 = [['+', 1, 2, ['-', 2, 1]]]
    l2 = [['-', 2, 2], ['-', 2, 1]]
    l3 = [['var', 'a', ['-', 2, 1]]]
    l4 = [['var', 'a', 1], ['var', 'b', ['+', 1, 1]],
          ['if', ['<', 'a', 0], 3, 'b']]
    l5 = [['var', 'a', 3], ['var', 'b', 2],
          ['def', 'f1', ['a', 'b'], [['-', ['+', 'a', 2], 3, 'b']]],
          ['call', 'f1', ['a', 'b']]]

    # ensure(Apply().apply_trees(l1) == 4, 'apply_trees test 1')
    # ensure(Apply().apply_trees(l2) == 1, 'apply_trees test 2')
    # ensure(Apply().apply_trees(l3) == 'N/A', 'apply_trees test 3')
    # ensure(Apply().apply_trees(l4) == 2, 'apply_trees test 4')
    ensure(Apply().apply_trees(l5) == 0, 'apply_trees test 5')
Example #19
def validate_gc_data(gc_data):
    # we've had one case like this
    ensure(gc_data != {},
           "glencoe returned successfully, but response is empty")

    # we also can't guarantee all of the sources will always be present
    known_sources = SOURCES.keys()
    for v_id, v_data in gc_data.items():

        available_sources = lfilter(lambda mtype: mtype + "_href" in v_data,
                                    known_sources)

        # fail if we have partial data
        msg = "number of available sources less than known sources for %r. missing: %s" % \
            (v_id, ", ".join(set(known_sources) - set(available_sources)))
        assert len(available_sources) == len(known_sources), msg
Example #20
def test():
    # test case,
    arr1 = [2, 3, 1, 0, 2, 5, 3]
    arr2 = [5, 4, 3, 2, 1, 0]
    arr3 = [12, 3]
    arr4 = []
    ensure((True, 2) == duplicate(arr1), 'arr1')
    ensure((False,) == duplicate(arr2), 'arr2')
    ensure((False,) == duplicate(arr3), 'arr3')
    ensure((False,) == duplicate(arr4), 'arr4')
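
The `duplicate` function exercised by these tests is not shown on this page. A minimal sketch consistent with the assertions here (every value must fall in the range 0..len(arr)-1, and the first duplicate found is reported) could be:

def duplicate(arr):
    # sketch only -- the real implementation behind these tests may differ
    n = len(arr)
    if n == 0 or any(not (0 <= x < n) for x in arr):
        return (False,)
    seen = set()
    for x in arr:
        if x in seen:
            return (True, x)
        seen.add(x)
    return (False,)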
Example #21
 def random_edges(self, percentage):
     ensure(self.triangulation,
            'You must triangulate before creating random edges')
     s_count = len(self.triangulation.simplices)
     rnd_count = int(s_count * 3 * percentage)
     rnd = s_count * np.random.rand(rnd_count)
     rnd = rnd.astype(int)
     counter = 0
     cur_try = 0
     while counter < len(rnd) and cur_try < s_count:
         cur_try += 1
         rn = rnd[counter]
         s = self.triangulation.simplices[rn]
         ri = random.randint(0, 2)
         i = s[ri]
         j = s[ri + 1] if ri < 2 else s[0]
         if not self.is_connected(self.lookup[i], self.lookup[j]):
             self.add_connection(self.lookup[i], self.lookup[j])
             counter += 1
Example #22
 def serialize(pair):
     key, val = pair
     ensure(isinstance(key, basestring), "key must be a string")
     ensure('|' not in key, "key must not contain a pipe")
     key = key.strip()
     ensure(key, "key must not be empty")
     return '|'.join([key, json.dumps(val)])
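
serialize pairs naturally with the splitter shown in Examples #8 and #16: the key travels before the pipe, the value travels after it as JSON. A small usage sketch (the key and value below are invented for illustration):

serialize(('title', 'New Title'))   # -> 'title|"New Title"'
splitter('title|"New Title"')       # -> ('title', '"New Title"')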
Example #23
 def serialize(pair):
     key, val = pair
     ensure(isinstance(key, basestring), "key must be a string")
     ensure('|' not in key, "key must not contain a pipe")
     key = key.strip()
     ensure(key, "key must not be empty")
     return '|'.join([key, json.dumps(val)])
Example #24
def test():
    # test case,
    arr1 = [2, 3, 5, 4, 3, 2, 6, 7]
    arr2 = [5, 4, 3, 2, 1, 0]
    arr3 = [1, 2, 3, 4, 3]
    arr4 = []
    result1 = duplicate(arr1)
    ensure(result1[0] == True and result1[1] in (2, 3), 'arr1')
    result2 = duplicate(arr2)
    ensure(result2[0] == False, 'arr2')
    result3 = duplicate(arr3)
    ensure(result3[0] == True and result3[1] in (3, ), 'arr3')
    result4 = duplicate(arr4)
    ensure(result4[0] == False, 'arr4')
Example #25
def mkreq(path, **overrides):
    ensure(not path.startswith('http://'), "no insecure requests, please")

    # TODO: consider removing these two lines
    if path.lstrip('/').startswith('article-xml/articles/'):
        path = os.path.abspath(path)

    path = 'file://' + path if not path.startswith('https://') else path
    msid, ver = utils.version_from_path(path)
    request = {
        'action': conf.INGEST,
        'location': path,
        'id': msid,
        'version': ver,
        'force': True, # TODO: shouldn't this be False?
        'validate-only': False,
        'token': 'pants-party'
    }
    request.update(overrides)
    # don't ever generate an invalid request
    utils.validate(request, conf.REQUEST_SCHEMA)
    return request
Example #26
def test_greater_than():
    l1 = ['>', 2, 1]
    l2 = ['>', 1, 2]
    l3 = ['>', 1, ['+', 1, 1]]

    ensure(Apply().greater_than(l1), 'more test 1')
    ensure(not Apply().greater_than(l2), 'more test 2')
    ensure(not Apply().greater_than(l3), 'more test 3')
Example #27
def test_apply():
    l1 = ['+', 1, 2, ['-', 2, 1]]
    l2 = ['if', ['>', 1, 2], 1, 2]
    l3 = ['if', ['<', 1, 2], 1, 2]
    l4 = ['if', ['=', 1, 2], 1, 2]

    # print(apply(l1))
    # print(apply(l2))
    # print(apply(l3))
    # print(apply(l4))

    ensure(Apply().apply(l1) == 4, 'apply test 1')
    ensure(Apply().apply(l2) == 2, 'apply test 2')
    ensure(Apply().apply(l3) == 1, 'apply test 3')
    ensure(Apply().apply(l4) == 2, 'apply test 4')
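
The Apply tests on this page all evaluate the same prefix-notation lists: '+' sums every operand, '-' subtracts the remaining operands from the first, and 'if' selects between its second and third operands. The Apply class itself is not shown; a minimal, self-contained sketch of an evaluator that would satisfy test_apply is:

def evaluate(expr):
    # sketch only -- not the original Apply class
    if not isinstance(expr, list):
        return expr  # literal number or boolean
    op, args = expr[0], [evaluate(a) for a in expr[1:]]
    if op == '+':
        return sum(args)
    if op == '-':
        result = args[0]
        for a in args[1:]:
            result -= a
        return result
    if op == '>':
        return args[0] > args[1]
    if op == '<':
        return args[0] < args[1]
    if op == '=':
        return args[0] == args[1]
    if op == 'if':
        cond, then_val, else_val = args  # note: both branches evaluated eagerly
        return then_val if cond else else_val
    raise ValueError('unknown operator %r' % op)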
Example #28
def test_less_than():
    l1 = ['<', 2, 1]
    l2 = ['<', 1, 2]
    l3 = ['<', 1, ['+', 1, 1]]

    ensure(not Apply().less_than(l1), 'less test 1')
    ensure(Apply().less_than(l2), 'less test 2')
    ensure(Apply().less_than(l3), 'less test 3')
Example #29
def test_judge_cmp():
    l1 = ['if', ['>', 3, 4], 1, 2]
    l2 = ['if', False, 2, ['+', 1, 2]]
    l3 = ['if', False, 2, ['if', True, 1, 2]]

    ensure(Apply().judge(l1) == 2, 'judge test 1')
    ensure(Apply().judge(l2) == 3, 'judge test 2')
    ensure(Apply().judge(l3) == 1, 'judge test 3')
Example #30
def test_equal():
    l1 = ['=', 2, 1]
    l2 = ['=', 2, 2]
    l3 = ['=', 2, ['+', 1, 1]]

    ensure(not Apply().equal(l1), 'equal test 1')
    ensure(Apply().equal(l2), 'equal test 2')
    ensure(Apply().equal(l3), 'equal test 3')
Example #31
def _subdesc(desc, path):
    """a path looks like: <type>.<target>
    for example: `file./opt/thing/` or `mysql-database.mydb1"""
    bits = path.split('.', 1)
    ensure(len(bits) == 2, "expecting just two bits, got %s bits: %s" % (len(bits), path), ValueError)
    toplevel, target = bits
    ensure(toplevel in desc, "descriptor has no %r key" % toplevel, ValueError)
    ensure(target in desc[toplevel], "given descriptor has no path %r" % path, ValueError)
    return {toplevel: [target]}
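
A usage sketch with a hypothetical descriptor (the keys below are invented for illustration and are not taken from the source):

desc = {'file': {'/opt/thing/': None}, 'mysql-database': {'mydb1': None}}
_subdesc(desc, 'mysql-database.mydb1')  # -> {'mysql-database': ['mydb1']}
_subdesc(desc, 'no-dot-here')           # raises ValueError ("expecting just two bits ...")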
Example #32
def test():
    # test case,
    # this method has a flaw: it cannot find all of the duplicated numbers,
    # because the range 1~2 contains the two numbers 1 and 2; when numbers in
    # that range also appear twice, we cannot tell whether each number appears
    # once or one of them appears twice
    arr1 = [2, 3, 5, 4, 3, 2, 6, 7]
    arr2 = [5, 4, 3, 2, 1, 0]
    arr3 = [1, 2, 3, 4, 4]
    arr4 = []
    result1 = duplicate(arr1)
    ensure(result1[0] == True and result1[1] in (2, 3), 'arr1')
    result2 = duplicate(arr2)
    ensure(result2[0] == False, 'arr2')
    result3 = duplicate(arr3)
    ensure(result3[0] == True and result3[1] in (4, ), 'arr3')
    result4 = duplicate(arr4)
    ensure(result4[0] == False, 'arr4')
Example #33
def make_inf_list(params, steps, make_string):
    l = [make_string(day, steps) for day in params['days']]
    return l

# specify params for synth data 
# params are also needed to construct inference prog / ripl
params_name = 'easy_d4_s33_bi4_be10'
#featurefunctions__maxDay_size_num_birds_softmaxbeta

params = get_params(params_name, 'onebird')
gtruth_params  = params.copy()
infer_params = params.copy()
gtruth_params['name'] = 'gtruth'
infer_params['name'] = 'infer'

ensure(params_name)
directory = params_name + '/'

generate_save_synthetic_data(gtruth_params, directory)


## specifying an inference prog:
# need list of inference strings or single one (filter/batch)
# for list, need num_days from params

exp_seed1 = dict(type='seq_block_mh',
                 steps=50,
                 make_inf_string=seq_block_mh_make_inf_string)
exp_seed2 = exp_seed1.copy()
exp_seed3 = dict(type='seq_block_pgibbs',
                 steps=50,
Example #34
def find_lax():
    ensure(os.path.exists(conf.PATH_TO_LAX), "could not find lax")
    script = join(conf.PATH_TO_LAX, "manage.sh")
    ensure(os.path.exists(script), "could not find lax's manage.sh script")
    return script
Example #35
 def is_connected(self, from_room, to_room):
     ensure(from_room >= 0 and from_room < len(self.nodes),
            'from_room out of bounds')
     ensure(to_room >= 0 and to_room < len(self.nodes),
            'to_room out of bounds')
     return self.connections[from_room, to_room] != 0
Example #36
    except BaseException as err:
        strbuffer.write("failed (%s)" % err)
    finally:
        log = conf.multiprocess_log('generation.log', __name__)
        log.info(strbuffer.getvalue())

def main(xml_dir, json_output_dir, num=None):
    paths = lmap(lambda fname: join(xml_dir, fname), os.listdir(xml_dir))
    paths = lfilter(lambda path: path.lower().endswith('.xml'), paths)
    paths = sorted(paths, reverse=True)
    if num:
        paths = paths[:num] # only scrape first n articles
    num_processes = -1
    Parallel(n_jobs=num_processes)(delayed(render)(path, json_output_dir) for path in paths)
    print('see scrape.log for errors')

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('xml-dir', nargs='?', default=conf.XML_DIR)
    parser.add_argument('output-dir', nargs='?', default=conf.JSON_DIR)
    parser.add_argument('--num', type=int, nargs='?')

    args = vars(parser.parse_args())
    indir, outdir = [os.path.abspath(args[key]) for key in ['xml-dir', 'output-dir']]

    ensure(os.path.exists(indir), "the path %r doesn't exist" % indir)
    ensure(os.path.exists(outdir), "the path %r doesn't exist" % outdir)

    main(indir, outdir, args['num'])
Example #37
def test_11():
    ensure(applyCompare(['>', 2, 1]) == True, 'test_11  >')
Example #38
 def wrap(v):
     ensure(v, "%s cannot be blank/empty/None" % label)
     return v
Example #39
def test_12():
    arr = ['+', 1, 2, 3]
    ensure(apply_ops(['>', 2, 1]) == True, 'test_12  >')
    ensure(apply_ops(arr) == 6, 'test_12  +')
Example #40
 def wrap(v):
     ensure(v, "%s cannot be blank/empty/None" % label)
     return v
Example #41
def find_lax():
    ensure(os.path.exists(conf.PATH_TO_LAX), "could not find lax")
    script = join(conf.PATH_TO_LAX, "manage.sh")
    ensure(os.path.exists(script), "could not find lax's manage.sh script")
    return script
Example #42
def file_handler(path):
    ensure(path.startswith(PROJECT_DIR),
           "unsafe operation - refusing to read from a file location outside of project root. %r does not start with %r" % (path, PROJECT_DIR))
    xml = open(path, 'r').read()
    # write cache?
    return xml
Example #43
def test():
    arr1 = [2, 3, 1, 0, 2, 5, 3]
    ensure((True, 2) == duplicate(arr1), 'arr1')
Example #44
def validate_response(response):
    utils.validate(response, conf.RESPONSE_SCHEMA)
    max_size_in_bytes = 262144
    response_size = len(serialise_response(response))
    ensure(response_size <= max_size_in_bytes, "response size (%s) is too large: %s" % (response_size, max_size_in_bytes), ValidationError)