cdr_path = data_path + 'cdr/'
bc_path = data_path + 'bc/'
embeddings_file = data_path + 'embeddings/glove.6B/glove.6B.100d.txt'

set_labels = {
    'cdr': ['Chemical', 'Disease'],
    'bc': ['Chemical', 'Gene'],
    'weak': [
        'Disease', 'Chemical', 'Species', 'Gene', 'ProteinMutation',
        'DNAMutation', 'SNP'
    ]
}

dp = DataProcessor(set_labels=set_labels, vocab=embeddings_file,
                   window_size=window_size)
dp.read_file(cdr_path + 'ner_CID_Training_mine_PubTator.txt',
             'cdr_train_weak', 'weak', update=True)
dp.read_file(cdr_path + 'ner_CDR_TrainingSet.PubTator.txt',
             'cdr_train', 'cdr', update=True)
dp.read_file(cdr_path + 'ner_CDR_DevelopmentSet.PubTator.txt', 'cdr_dev', 'cdr')
dp.read_file(cdr_path + 'ner_CDR_TestSet.PubTator.txt', 'cdr_test', 'cdr')
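# Each read_file call takes (input file, dataset name, label-set key); the
# update=True flag on the training files presumably lets them extend the
# processor's shared vocabulary maps, while dev/test reuse them read-only.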
# -*- coding: utf-8 -*-
import redis
from flask import Flask, jsonify

import config
from process import DataProcessor

app = Flask(__name__)
dp = DataProcessor()
client = redis.StrictRedis(host=config.redis_host, port=config.redis_port,
                           decode_responses=True, charset='utf-8')


@app.route('/')
def index():
    return 'OK'


@app.route('/<protocol>/random')
def random(protocol):
    """
    Return a random high-quality proxy IP.
    :param protocol: protocol name used to build the Redis set key
    :return: a random member of that proxy set
    """
    return client.srandmember('{}:proxies:{}'.format(protocol, 1))
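# A minimal entry point for local testing, as a sketch assuming the default
# Flask development server; the host and port here are placeholders, not
# values taken from config.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5555)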
from utils import get_sidereal_time
from process import open_fits, flatten_max, DataProcessor
import dateutil.parser
import os, shutil

dp = DataProcessor()
dp.outdir = 'test/out'
dp.verbose = 1

st = '0429'
path_end = os.path.join(st[0:2], st[2:4])
path = os.path.join('sid', path_end)
night = os.listdir(path)

dp.do_total = True
dp.indir = 'sid'
dp.do_filter = False
dp.do_output = False
dp.process_night(path, night)

from django.template import Context, Template

t = Template(open(os.path.join('clouds', 'templates', 'clouds', 'image.html')).read())

from catlib import parse_cat

point_list = [row for _, row in
              parse_cat(os.path.join('test', 'out', 'cat', path, 'total.cat')).iterrows()]
print(len(point_list))

with open(os.path.join('test', st + '.html'), 'w') as out:
    out.write(t.render(Context({
        'point_list': point_list,
        'point_pk': -1,
        'object': {'get_url': path + '/total'},
    })))
""" Usage: parser_cli.py [options] INPUT_FILEPATH Options: -h --help --language LANGUAGE Language """ import json from docopt import docopt from tree_sitter import Language from language_data import LANGUAGE_METADATA from process import DataProcessor if __name__ == '__main__': args = docopt(__doc__) DataProcessor.PARSER.set_language( Language('/src/build/py-tree-sitter-languages.so', args['--language'])) processor = DataProcessor(language=args['--language'], language_parser=LANGUAGE_METADATA[ args['--language']]['language_parser']) functions = processor.process_single_file(args['INPUT_FILEPATH']) print(json.dumps(functions, indent=2))
import ast
import hashlib
import sys

import javalang
from lib2to3 import pygram, pytree
from lib2to3.pgen2 import driver
from tree_sitter import Language

from language_data import LANGUAGE_METADATA
from process import DataProcessor

# JAVA_REJECT_REGEX, PY_REJECT_REGEX, BANNED_JAVA_SHAS, BANNED_PY_SHAS,
# remove_func_name, and subtokenize are defined elsewhere in this module.


def process(target):
    DataProcessor.PARSER.set_language(
        Language('/src/build/py-tree-sitter-languages.so', sys.argv[1]))
    processor = DataProcessor(
        language=sys.argv[1],
        language_parser=LANGUAGE_METADATA[sys.argv[1]]['language_parser']
    )

    results = []

    # Validate that the blob parses before extracting functions from it.
    if target['language'] == 'java':
        try:
            javalang.parse.parse(target['the_code'])
        except Exception as ex:
            if sys.argv[2] != 'gz':
                print('Failed to validate: ' + target['from_file'])
                print(target['the_code'])
                print(ex)
            return False, []
    elif target['language'] == 'python':
        try:
            parser = driver.Driver(pygram.python_grammar, convert=pytree.convert)
            parser.parse_string(target['the_code'].strip() + '\n')
            ast.parse(target['the_code'])
        except Exception:
            if sys.argv[2] != 'gz':
                print('Failed to validate: ' + target['from_file'])
            return False, []

    functions = processor.process_blob(target['the_code'])
    for function in functions:
        sha256 = hashlib.sha256(
            function["function"].strip().encode('utf-8')
        ).hexdigest()

        if target['language'] == 'java':
            if JAVA_REJECT_REGEX.search(function["function"]):
                continue
            if sha256 in BANNED_JAVA_SHAS:
                # print(" - Skipped '{}'".format(sha256))
                continue  # Spoon transformer chokes on these, so exclude
        elif target['language'] == 'python':
            if PY_REJECT_REGEX.search(function["function"]):
                continue
            if sha256 in BANNED_PY_SHAS:
                # print(" - Skipped '{}'".format(sha256))
                continue  # Spoon transformer chokes on these, so exclude

        tokens_pre, tokens_post = ([], [])
        try:
            tokens_pre, tokens_post = remove_func_name(
                function["identifier"].split('.')[-1],
                function["function_tokens"]
            )
        except Exception:
            continue

        results.append({
            "language": function["language"],
            "identifier": function["identifier"].split('.')[-1],
            "target_tokens": subtokenize(function["identifier"].split('.')[-1]),
            "source_tokens": tokens_post,
            "elided_tokens": tokens_pre,
            "source_code": function["function"] if function["language"] != "java" else (
                'class WRAPPER {\n' + function["function"] + '\n}\n'
            ),
            "sha256_hash": sha256,
            "split": target['split'],
            "from_file": target['from_file']
        })

    return True, results
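# Example call, as a sketch: the target record below is hypothetical, and the
# script is assumed to run with the language name in sys.argv[1] and an output
# mode in sys.argv[2], matching the uses above.
#
#   ok, rows = process({
#       'language': 'python',
#       'the_code': 'def add(a, b):\n    return a + b\n',
#       'from_file': 'example.py',
#       'split': 'train',
#   })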
import functools
import gzip
import pickle
from collections import Counter, defaultdict
from multiprocessing import Pool

from tree_sitter import Language

from language_data import LANGUAGE_METADATA
from process import DataProcessor

# args (docopt arguments), dees (dependency counts), dents, and flatten are
# provided earlier in the original script.

definitions = defaultdict(list)
with open(args['DEFINITION_FILE'], 'rb') as f:
    for d in pickle.load(f):
        definitions[d['nwo']].append(d)
definitions = dict(definitions)

# Fill candidates from most depended-upon libraries
c = Counter(dees)
library_candidates = {}
for nwo, _ in c.most_common(len(c)):
    if nwo.split('/')[-1] not in library_candidates and nwo in definitions:
        # Approximate library name with the repository name from nwo
        library_candidates[nwo.split('/')[-1]] = definitions[nwo]

DataProcessor.PARSER.set_language(Language(args['--tree-sitter-build'], args['--language']))
processor = DataProcessor(language=args['--language'],
                          language_parser=LANGUAGE_METADATA[args['--language']]['language_parser'])

with Pool(processes=int(args['--processes'])) as pool:
    output = pool.imap_unordered(
        functools.partial(processor.process_dent,
                          ext=LANGUAGE_METADATA[args['--language']]['ext']),
        dents)
    # Consume the lazy iterator while the pool is still open.
    dent_definitions, edges = map(list, map(flatten, zip(*output)))

with gzip.GzipFile(args['OUTPUT_DIR'] + '{}_dent_definitions.pkl.gz'.format(args['--language']), 'wb') as outfile:
    pickle.dump(dent_definitions, outfile)
with gzip.GzipFile(args['OUTPUT_DIR'] + '{}_edges.pkl.gz'.format(args['--language']), 'wb') as outfile:
    pickle.dump(edges, outfile)
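# The gzipped pickles written above can be read back the same way, a sketch
# mirroring the dump calls:
#
#   with gzip.GzipFile(args['OUTPUT_DIR'] + '{}_edges.pkl.gz'.format(args['--language']), 'rb') as f:
#       edges = pickle.load(f)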
# These read as keep-probabilities (1.0 = keep everything, i.e. no dropout),
# though that depends on how the model applies them.
hidden_dropout = 0.75
input_dropout = 0.5
middle_dropout = 1.0
word_dropout = 0.75
clip_norm = 5
batch_size = 32

################
# process data #
################
embeddings_file = '/home/nathan/Programming/research/data/embeddings/glove.6B/glove.6B.100d.txt'

dp = DataProcessor(vocab=embeddings_file)
dp.read_file(
    '/home/nathan/Programming/research/data/cdr/ner_CDR_train.txt',
    '/home/nathan/Programming/research/sandbox/protos/cdr_train.proto',
    'cdr',
    update=True)
# dp.read_file('/home/nathan/Programming/research/data/cdr/ner_CDR_test.txt',
#              '/home/nathan/Programming/research/sandbox/protos/cdr_test.proto',
#              'cdr')
# dp.read_file('/home/nathan/Programming/research/data/cdr/ner_CDR_dev.txt',
#              '/home/nathan/Programming/research/sandbox/protos/cdr_dev.proto',
#              'cdr')
from utils import get_sidereal_time
from process import open_fits, flatten_max, DataProcessor
import dateutil.parser
import os, shutil

dp = DataProcessor()
dp.outdir = 'test/out'
dp.verbose = 1

#date_obs = '2011-05-25T06:00:10'
date_obs = '2012-02-29T10:37:12'
name = date_obs + '.fits'
path = os.path.join('sym', name[0:4], name[5:7], name[8:10])
dp.process_file(os.path.join(path, name))

"""
dt = dateutil.parser.parse(name.split('.')[0])
s = get_sidereal_time(dt).seconds
path_end = os.path.join(*[str(x).zfill(2) for x in [s // 3600, (s // 60) % 60]])
fname = os.path.join('out', 'fits', 'sid', path_end, 'total.fits')
tdata = open_fits(fname)
night = os.listdir(os.path.join('sid', path_end))
for i in [100, 250, 500, 1000, 3000, 4000, 5000, 2000]:
    dp.output('total', tdata, image_filter=flatten_max(i * len(night)))
    shutil.copyfile(os.path.join('test', 'out', 'png', 'total.png'),
                    os.path.join('test', 'total{0}.png'.format(i)))
"""
set_labels = {
    'A': [
        'T005', 'T007', 'T037', 'T038', 'T058', 'T074', 'T092', 'T098',
        'T168', 'T170'
    ],
    'B': ['T017', 'T031', 'T062', 'T082', 'T091', 'T097', 'T103', 'T201', 'T204'],
    'full': [
        'T005', 'T007', 'T037', 'T038', 'T058', 'T074', 'T092', 'T098',
        'T168', 'T170', 'T017', 'T031', 'T062', 'T082', 'T091', 'T097',
        'T103', 'T201', 'T204'
    ]
}

dp = DataProcessor(set_labels=set_labels, vocab=embeddings_file,
                   window_size=window_size)
dp.read_file(path + 'train_split_A_modified', 'A_train', 'A', update=True)
dp.read_file(path + 'train_split_B_modified', 'B_train', 'B', update=True)
dp.read_file(path + 'ner_dev', 'dev', 'full')
dp.read_file(path + 'ner_test', 'test', 'full')

###############
# build model #
###############
vocab_size = len(dp.token_map)
labels_size = len(dp.label_map)
shape_domain_size = len(dp.shape_map)
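# token_map, label_map, and shape_map are the vocabularies the DataProcessor
# accumulated while reading the files above (update=True on the training
# splits); their sizes presumably dimension the model's embedding and output
# layers.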