def install(self):
    # Check if package installed
    db = hpakDB(self.pkg_name)
    if db.get_value("status") == "installed":
        misc.print_error("%s - already installed!" % (self.pkg_name), False)
        return

    self.prepare_install()
    dl = download(self.options['source'], self.pkg_path, self.pkg_name)
    dl.get()

    # Extracting the file.
    e = Extractor(self.options)
    e.extract()

    # Install depends
    self.install_dep()

    Cmds = self.options['install'].split(',')
    for cmd in Cmds:
        subprocess.Popen(cmd, shell=True).wait()

    # Verify package installed.
    if os.path.exists("%s/%s" % (HPAK_ROOT, self.options['dir'])):
        db = hpakDB(self.pkg_name)
        db.set_value("status", "installed")
        misc.print_success("%s installed." % (self.pkg_name))
    else:
        misc.print_error("%s-%s NOT installed, please try again." % (self.pkg_name, self.options['version']), True)
def process(self, file):
    feats = {}
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    pos = cqpf.getColumn(1)

    # initialize counts
    for name in self.posnames:
        feats[name] = 0

    for i in range(2, len(pos)):  # ignore first two pos ...
        uni = (pos[i])[0:3]
        bi = (pos[i - 1])[0:3] + "_" + uni
        tri = (pos[i - 2])[0:3] + "_" + bi
        if uni in feats:
            feats[uni] += 1
        if bi in feats:
            feats[bi] += 1
        if tri in feats:
            feats[tri] += 1

    for x in self.posnames:
        feats[x] /= float(len(pos) - 2)

    return ir.getID(), feats
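# A small, self-contained illustration (not part of the project above) of the
# n-gram keys the loop in the preceding snippet builds: each POS tag is
# truncated to its first three characters and joined right-to-left with "_".
pos = ["DT0", "JJ0", "NN1", "VBZ"]
i = 3
uni = pos[i][0:3]                    # 'VBZ'
bi = pos[i - 1][0:3] + "_" + uni     # 'NN1_VBZ'
tri = pos[i - 2][0:3] + "_" + bi     # 'JJ0_NN1_VBZ'
print(uni, bi, tri)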
class ExtractorTest(unittest.TestCase):

    def setUp(self):
        example = "LOQ75625Team LOQ 49% blend std 8mm 21-JUN-2000 12:55:23 30.0"
        self.extractor = Extractor()
        self.extractor.extract_data(example)

    def tearDown(self):
        pass

    def test_extract_valid_instrument_name(self):
        self.assertEqual("LOQ", self.extractor.instrumentname)

    def test_extract_valid_run_number(self):
        self.assertEqual("75625", self.extractor.runnumber)

    def test_extract_valid_run_username(self):
        self.assertEqual("Team LOQ", self.extractor.username)

    def test_extract_valid_run_experimenttitle(self):
        self.assertEqual("49% blend std 8mm", self.extractor.experimenttitle)

    def test_extract_valid_run_startdate(self):
        self.assertEqual("21-JUN-2000", self.extractor.startdate)

    def test_extract_valid_run_starttime(self):
        self.assertEqual("12:55:23", self.extractor.starttime)

    def test_extract_valid_run_charge(self):
        self.assertEqual("30.0", self.extractor.charge)
class Framework:
    """"""

    def __init__(self, classifier):
        """"""
        self.classifier = classifier
        self.extractor = Extractor()

    def _create_subjects(self):
        """"""
        return [Subject(FILE_NAMES['NormROIS'] % (file_index + 1))
                for file_index in range(NUM_OF_SUBJECTS)]

    def _train(self, classifier, features):
        """"""
        classifier.train(features)

    def _classify(self, classifier):
        """"""
        classifier.classify()

    def execute(self):
        """"""
        # 1) Load the data files
        subjects = self._create_subjects()

        # 2) Extract the features
        self.extractor.extract_features(subjects)

        print len(self.extractor.features['P']),
        exit()

        # 3) Train the classifier
        self._train(self.classifier, self.extractor.features)

        # 4) Classify some data
        self._classify(self.classifier)
def test_cond(self):
    from masks import mask

    e = Extractor()
    logging.debug(e)
    e.add_feature_condition(mask)
    res = e.extract(self.data)
    self.assertTrue(len(res[self.data.keys()[0]]) > 0)
def generateFeatures(self):
    '''
    Hardcoded for Wikipedia.
    For each category, fetch wiki pages from list.txt and
    store keywords (links in the specified section) in features.txt.
    '''
    e = Extractor()
    print self.categories
    for name in self.categories:
        print name
        f = open("%s/%s/%s" % (self.config.get(self.section, "CLASSES_FILE"), name,
                               self.config.get(self.section, "LIST_FILE")), "r")
        g = open("%s/%s/%s" % (self.config.get(self.section, "CLASSES_FILE"), name,
                               self.config.get(self.section, "FEATURE_FILE")), "w")
        for page in f:
            print page
            pagetok = page.strip().split('\t')
            try:
                section = pagetok[1]
            except:
                section = 0
            links = e.getWikiLinks(pagetok[0], section=section)
            for feature in links:
                units = set(self.clean(feature).split('_'))
                for unit in units:
                    unit = self.stemmer.stem(unit)
                    if self.valid(unit):
                        g.write("%s," % unit)
                g.write("\n")
        f.close()
        g.close()
def __init__(self):
    self.__featureNames = sorted(
        [name for (name, re) in self.DIRECT_FEATS] +
        [name for (name, re) in self.LEMMA_FEATS] +
        self.CALCULATED_FEATS
    )
    Extractor.__init__(self)
def test_extractorResultGetData(self):
    strategy = mock.MagicMock()
    strategy.get_data.return_value = {"success": True}
    extractor = Extractor(strategy)
    result = extractor.get_result()
    self.assertTrue(result.get_data()["success"])
def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"  # request.form['status']
        filters = ""  # request.form['exculdeurls']
        # rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            # r = requests.get(rdf)
            gg = Graph()
            # g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            # print rdf_content.readline()
            gg.parse(rdf_content, format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"
                uri_lookup = str(uri) + "\""
                lnum = ext.get_lines(rdf_content, uri_lookup)
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
def test_extractorResultGetJson(self):
    strategy = mock.MagicMock()
    strategy.get_data.return_value = {"success": True}
    extractor = Extractor(strategy)
    result = extractor.get_result()
    self.assertEqual('{"success": true}', result.get_json())
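# A minimal sketch of the strategy-style design the two mocked tests above
# appear to exercise. The class and method bodies below are assumptions for
# illustration, not the project's actual implementation.
import json


class ExtractionResult(object):
    def __init__(self, data):
        self._data = data

    def get_data(self):
        return self._data

    def get_json(self):
        return json.dumps(self._data)


class StrategyExtractor(object):
    def __init__(self, strategy):
        self._strategy = strategy

    def get_result(self):
        # Delegate data gathering to the injected strategy and wrap the result.
        return ExtractionResult(self._strategy.get_data())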
def predict(article_link, image_link):
    """
    output: predicted emotion as: [ 0.  1.  0.  0.  0.]
    """
    e = Extractor()
    user_input = {
        "article_link": article_link,
        "image_link": image_link
    }
    friendly_json = e.user_extract(user_input)

    tax_list = friendly_json['alchemy']['taxonomy']
    tax_primary = []
    for t in tax_list:
        tax_primary.append(t['label'].split('/')[1])
    tax_primary = list(set(tax_primary))[0]

    extracted_articles = dict()
    extracted_articles['articles'] = [friendly_json]
    textEmotions = text_emotions_x(extracted_articles)
    picEmotions = picture_emotions_x(extracted_articles)

    with open('emotionClassification/trained_models/bbac_1150_all_clf.pkl', 'r') as f:
        clf = cPickle.load(f)
    test_article = makeDataMatrix(textEmotions, picEmotions)
    reaction = predictReactions(clf, test_article)

    return reaction[0], tax_primary
def process(self, file):
    feats = {}
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    # words = ' '.join(cqpf.getColumn(0))
    # pos = ' '.join(self.disambiguatePOS(cqpf.getColumn(1)))
    lemma = cqpf.getColumn(2)
    sentences = cqpf.getAnnotations("s")

    wordpostmp = []
    for (start, end, attr) in sentences:
        wordpostmp.append('<s>')
        wordpostmp.extend(self.getWordsWithPOS(
            cqpf.getColumn(0)[start:end],
            self.disambiguatePOS(cqpf.getColumn(1)[start:end])))
        wordpostmp.append('</s> ')
    wordpos = ' '.join(wordpostmp)

    feats.update(self.extractWithREs(self.DIRECT_FEATS, wordpos))
    feats.update(self.extractWithREs(self.CALC_FEATS, wordpos))
    feats.update(self.extractFromLemmatatizedForms(self.LEMMA_FEATS, lemma))
    self.calculateFeats(feats)
    self.normalizeByLength(feats, len(lemma))
    feats.update(self.extractStatistics(cqpf))
    print feats
    return ir.getID(), feats
def add_synset(self, word):
    ex = Extractor()
    word_id = md5.md5(word).hexdigest()
    if not self.fdb.get(word_id):
        self.fdb.set(ROOT + word_id, word)
        synset = ex.getWikiBacklinks(word)
        if synset:
            for synonym in synset:
                self.fdb.set(SYN + synonym.upper(), word_id)
def test_monotony(self):
    from masks import absolute_monotony as monotony

    e = Extractor()
    logging.debug(e)
    e.add_feature_condition(monotony.Raising)
    e.add_feature_condition(monotony.Falling)
    res = e.extract(self.data)
    logging.debug("res: \n%s", pprint.pformat(res))
    self.assertTrue(len(res[self.data.keys()[0]]) > 0)
def process(self, file):
    feats = {}
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    lengths = [end - start for (start, end, arg) in cqpf.getAnnotations("s")]
    print self.__featureNames
    feats = utils.getStats("SENT_LENGTH", lengths)
    return ir.getID(), feats
def add_disambiguation(self, a):
    ex = Extractor()
    ls = ex.getDisambiguationLinks(a + '_(disambiguation)')
    if ls:
        anode = self.graphdb.get_or_create_indexed_node(
            self.DISAMBIGUATION, 'name', a,
            {'name': a, 'class': self.DISAMBIGUATION})
        for l in ls:
            print "disambiguation link:", l
            lnode = self.graphdb.get_indexed_node('NODE', 'name', l)
            if lnode:
                print "creating disamb relation betn", a, ", ", l
                self.graphdb.create((anode, self.DISAMBIGUATION, lnode,
                                     {'class': self.DISAMBIGUATION, 'weight': 1}))
def extract(args, task, crawler_list):
    """runs every crawler in crawler_list"""
    # init and start feature extractor
    logging.debug("Extracting")
    feature_extractor = Extractor()

    # FEATURE LIST:
    cli_mask_groups = parse_arg_range(args.extraction_masks, type_=str)
    all_masks = get_all_masks(cli_mask_groups + task.mask_groups)
    feature_extractor.add_feature_masks(all_masks)

    # data storage paths
    extractor_stream(task, feature_extractor, crawler_list)
    logging.debug("done extracting")
def __init__(self, filename):
    super(Database, self).__init__()
    Extractor.extract(filename)
    lines = ''
    with open(filename, 'r') as f:
        lines = f.readlines()
    self.courses = list()
    for i in lines:
        x = i.split(',')
        x = [y.strip('()"') for y in x]
        self.courses.append(Course(x[0], x[1], x[2], x[3], x[4], x[5], x[6]))
class Teacher:

    def __init__(self):
        self.model = SongModel()
        self.extractor = Extractor()

    def parse_set(self):
        content = []
        with open("training/Tracks/ground_truth.csv") as f:
            for l in f:
                l = l.replace('\"', '').replace('\n', '')
                name = ""
                genre = ""
                flag = 0
                for c in l:
                    if c == ',':
                        flag = 1
                    elif flag == 0:
                        name += c
                    elif flag == 1:
                        genre += c
                content.append([name, genre])
        return content

    def train(self):
        for item in self.parse_set():
            self.extractor.set_song(item[0])
            tempo = self.extractor.get_tempo()
            rolloffmoy = self.extractor.get_rolloff_moy()
            rolloffect = self.extractor.get_rolloff_ect()
            zcrmoy = self.extractor.get_zcr_moy()
            zcrect = self.extractor.get_zcr_ect()
            duration = self.extractor.get_duration()
            self.model.add(item[0], item[1], tempo, rolloffmoy, rolloffect,
                           zcrmoy, zcrect, duration)
            print("ADDED : " + item[0] + " " + item[1] + " " + str(tempo) + " " +
                  str(rolloffmoy) + " " + str(rolloffect) + " " + str(zcrmoy) + " " +
                  str(zcrect) + " " + str(duration))
        print("DONE")
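# Design note: parse_set() above splits each ground-truth line on the first
# comma by hand. Below is a sketch of the same [name, genre] pairing using the
# standard csv module; this is an alternative for illustration, not the
# project's code, and it assumes each row has at least two columns.
import csv


def parse_set_with_csv(path="training/Tracks/ground_truth.csv"):
    content = []
    with open(path, newline='') as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                content.append([row[0], row[1]])
    return content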
def __init__(self, parent):
    Frame.__init__(self, parent)
    self.parent = parent
    self.music_root = ''
    self.query_path = ''
    self.extractor = Extractor(n_frames=40,
                               n_blocks=100,
                               learning_rate=0.00053,
                               verbose=True)
    self.style = Style()
    self.style.theme_use("default")

    padx = 2
    pady = 2

    root_select_button = Button(self, text="Select a directory")
    root_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
    root_select_button.bind("<Button-1>", self.set_music_root)

    analyze_button = Button(self, text="Analyze")
    analyze_button.pack(fill=tkinter.X, padx=padx, pady=pady)
    analyze_button.bind("<Button-1>", self.analyze)

    query_select_button = Button(self, text="Select a file")
    query_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
    query_select_button.bind("<Button-1>", self.set_query_path)

    search_button = Button(self, text="Search similar songs")
    search_button.pack(fill=tkinter.X, padx=padx, pady=pady)
    search_button.bind("<Button-1>", self.search_music)

    self.pack(fill=BOTH, expand=1)
def __init__(self):
    self.extractor = Extractor()
    self.sqs = boto3.client('sqs')
    self.queue_url = 'https://sqs.ap-southeast-1.amazonaws.com/841662669278/crawler'
    self.s3 = boto3.client('s3')
    self.dynamodb = boto3.resource('dynamodb')
    self.bloom_filter = MyBloomFilter(self.dynamodb.Table('link'))
class IndexTrainer(object):

    def __init__(self):
        self.index = InvertedIndex()
        self.bow = Bow()
        self.extractor = Extractor('surf')
        print self.index.author
        print self.index.description

    def load_feature(self, path='../models/feature.npy'):
        self.features = np.load(path)
        if len(self.features) > 200000:
            self.features = self.features[:200000]
        print "feature shape: ", self.features.shape
        return self.features

    def run(self, path):
        self.bow.load()
        self.index.reset(self.bow.centers)
        images = imutil.get_list_image(path)
        t = imutil.Timer(1)
        t.tic()
        for i, image in enumerate(images):
            descriptors = self.extractor.extract(image)
            self.index.append(image, descriptors)
            if (i + 1) % 1000 == 0:
                t.toc('finish 1000 images: ')
                t.tic()
def extract_multi(args):
    # Extract features from the net (using the proto file) for the images listed
    # in the input file. Features are saved in batches of max_value per .cPickle
    # file rather than one file, because 2.4M images produce ~10 GB of data.
    pred = Extractor(args.proto_path, args.bin_path)
    max_value = 512
    curr_value = 0
    list_all_result = list()
    list_good_class_all = list()
    list_name_file = list()
    create_dir(args.folder)
    with open(args.images, 'r') as file_image:
        list_images = list()
        list_good_class = list()
        for idx, line in enumerate(file_image):
            splitted = line.split(' ')
            list_good_class.append(int(splitted[1]))
            list_images.append(splitted[0].strip())
            curr_value = curr_value + 1
            if curr_value < max_value:
                continue
            else:
                # predict using value
                predictions = pred.predict_multi(list_images)
                f = Feature(predictions, list_good_class)
                name = '/'.join((args.folder, str(idx) + "_file.cPickle"))
                list_name_file.append(os.path.abspath(name))
                save_cPickle(f, name)
                list_good_class = list()
                list_images = list()
                curr_value = 0
                print "Predicted 512"

        # predict last package of data, which is smaller than max_value
        if len(list_images) > 0:
            predictions = pred.predict_multi(list_images)
            list_all_result.append(predictions)
            f = Feature(predictions, list_good_class)
            name = '/'.join((args.folder, str(idx) + "_file.cPickle"))
            save_cPickle(f, name)
            list_name_file.append(os.path.abspath(name))

    f = open(args.folder + '/' + 'files.txt', 'wb')
    f.writelines("%s\n" % item for item in list_name_file)
    f.close()
def getWikiDist(self, a, b):
    a = a.replace(' ', '_')
    b = b.replace(' ', '_')
    e = Extractor()
    sa = e.getWikiBacklinks(a, filter="nonredirects")
    sb = e.getWikiBacklinks(b, filter="nonredirects")
    n1 = log(max(len(sa), len(sb)))
    n2 = log(len(set.intersection(sa, sb)))
    d1 = log(10 ** 7)
    d2 = log(min(len(sa), len(sb)))
    extra1 = extra2 = 0
    # if a in sb: extra1 = log(10 ** 7 / len(sb))
    # if b in sa: extra2 = log(10 ** 7 / len(sa))
    try:
        return (n1 - n2) / float(d1 - d2)
    except ZeroDivisionError as e:
        print e
        return self.INF
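# Interpretation (not stated in the source): the ratio returned by getWikiDist
# above matches the Normalized Google Distance computed over Wikipedia backlink
# sets, with the index size fixed at N = 10**7:
#
#     NGD(a, b) = (log max(|Sa|, |Sb|) - log |Sa ∩ Sb|)
#                 / (log N - log min(|Sa|, |Sb|))
#
# where Sa and Sb are the backlink sets returned by getWikiBacklinks.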
class Extraktor(object):

    def __init__(self):
        self.extractor = Extractor()
        self.sqs = boto3.client('sqs')
        self.queue_url = 'https://sqs.ap-southeast-1.amazonaws.com/841662669278/crawler'
        self.s3 = boto3.client('s3')
        self.dynamodb = boto3.resource('dynamodb')
        self.bloom_filter = MyBloomFilter(self.dynamodb.Table('link'))

    def process(self):
        while True:
            ret = self.sqs.receive_message(
                QueueUrl=self.queue_url,
                MaxNumberOfMessages=10,
                WaitTimeSeconds=1
            )
            if 'Messages' not in ret:
                continue
            for msg in ret['Messages']:
                key = msg['Body']
                record = self.s3.get_object(Bucket='samuel-html', Key=key)
                # record['Body'] is a botocore.response.StreamingBody
                pack = json.loads(lzo.decompress(record['Body'].read()).decode('utf-8'))

                # response = self.client.delete_message(
                #     QueueUrl=self.queue_url,
                #     ReceiptHandle=msg['ReceiptHandle']
                # )
                # print(response)

                self.bloom_filter.add(pack['url'])
                if pack.get('code') == 200:
                    url = pack['url']
                    ret = self.extractor.extract(pack)
                    for link in ret['links']:
                        if not self.bloom_filter.add(link['url']):
                            seed(link)
                        else:
                            # print 'already crawled', link['url']
                            pass

                    # save pack to tbl_link
                    self.dynamodb.Table('link').put_item(
                        Item={
                            'url': url,
                            'ctime': Decimal(str(time.time())),
                            'utime': Decimal(str(time.time()))
                        }
                    )
                    logger.info("%s ok" % (pack['url']))
                else:
                    logger.warn("%s not ok code:%d" % (pack['url'], pack.get('code')))

                response = self.sqs.delete_message(
                    QueueUrl=self.queue_url,
                    ReceiptHandle=msg['ReceiptHandle']
                )
def extract_comments(self):
    if self.has_soup():
        comments = self.soup.find_all("div", class_="comment") or []
        for comment in comments:
            extractor = Extractor(comment)
            author = extractor.extract_comment_author_user_name()
            post_url = ""  # this needs to be set with the post in scope
            date = extractor.extract_comment_date()
            score = extractor.extract_comment_score()
            body = extractor.extract_comment_body()
            self.comments.append(Comment(
                author=author,
                post_url=post_url,
                date=date,
                score=score,
                body=body
            ))
    return self.comments
def __init__(self, song):
    self.song = song
    self.model = SongModel()
    self.extractor = Extractor()
    self.tempo = 0
    self.rolloffmoy = 0.0
    self.rolloffect = 0.0
    self.zcrmoy = 0.0
    self.zcrect = 0.0
    self.duration = 0.0
    self.genre = []
    for l in open("training/Tracks/genres.txt"):
        self.genre.append(l.replace('\n', ''))
def ext_result():
    rdfUrl = ''
    if request.method == 'POST':
        rdfUrl = request.form['url']
        try:
            r = requests.get(rdfUrl)
            # rdfUrl = str(r.status_code)
            g = Graph()
            # g.parse("MAD.rdf", format="xml")
            g.load(rdfUrl)
            ext = Extractor(g)
            uris = ext.getUris()
            terms = ext.terms()
            result = {}
            result['uris'] = uris
            result['terms'] = terms
            result['bNodes'] = str(len(ext.getBnodes()))
            result['uNodes'] = str(len(uris))
            return render_template('index.html', result=result)
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
def setUp(self):
    # open and parse model xml file
    self.fmodel = open("logs_new/model_log.xml")
    self.model_tree = ET.parse(self.fmodel)
    self.model_root = self.model_tree.getroot()

    # create sample experiment data
    example = "LOQ75625Team LOQ 49% blend std 8mm 21-JUN-2000 12:55:23 30.0"
    self.extractor = Extractor()
    self.extractor.extract_data(example)
    self.maxDiff = None

    self.xml_out = XMLOutputter("testlog")
    self.xml_out.write_line(self.extractor)
def main():
    downloader = Downloader()
    extractor = Extractor()
    url = "https://pornhub.com"

    puts(colored.green("getting video keys."))
    main_page = downloader.get(url)
    view_keys = extractor.get_viewkeys(main_page)

    puts(colored.green("starting to download videos."))
    for key in view_keys:
        puts(colored.green("getting video information."))
        absolute_url = "https://pornhub.com/view_video.php?viewkey=" + key
        page = downloader.get(absolute_url)
        info = extractor.get_video_info(page)
        if info is None:
            continue
        hd_quality = info['mediaDefinitions'][0]
        puts(colored.green("downloading video %s." % info['video_title']))
        downloader.save_file(hd_quality["videoUrl"], info['video_title'] + ".mp4")
from extractor import Extractor
import LeaveMessage
import re
from getTime import *
from getType import *
from doAskForLeave import *
from stanfordcorenlp import StanfordCoreNLP
from getReason import get_reason

ex = Extractor()


# def get_type(sentence):
#     affairs = re.search(r'(.*)事(.*)假(.*).*', sentence, re.M | re.I)
#     sick = re.search(r'(.*)病(.*)假(.*).*', sentence, re.M | re.I)
#     marriage = re.search(r'(.*)婚(.*)假(.*).*', sentence, re.M | re.I)
#     if affairs:
#         return "事假"    # personal leave
#     elif sick:
#         return "病假"    # sick leave
#     elif marriage:
#         return "婚假"    # marriage leave
#     else:
#         return None


def ask(message):
    if (message.startDate is None and message.endDate is None
            and message.duration is None and message.type is None
            and message.examinePerson is None and message.email is None
            and message.reason is None):
        return "请输入请假时间等信息"  # "Please enter the leave time and other details"
    if message.type is None:
        return "请输入请假类型"  # "Please enter the leave type"
    sys.exit(0)

yto_config = yto(sys.argv[1])

# Set defaults.
seq_length = yto_config.videoSeqLength
class_limit = None  # Number of classes to extract. Can be 1-101 or None for all.

# Get the dataset.
data = DataSet(seq_length=seq_length, class_limit=class_limit,
               repo_dir=yto_config.repoDir,
               feature_file_path=yto_config.featureFileName,
               work_dir=yto_config.workDir)

# Get the model.
model = Extractor()

# Loop through data.
pbar = tqdm(total=len(data.data))

sequence_path = os.path.join(yto_config.workDir, 'sequences')
if not os.path.exists(sequence_path):
    print("Creating sequence folder [%s]" % sequence_path)
    os.makedirs(sequence_path)

for video in data.data:
    # Get the path to the sequence for this video.
    path = os.path.join(sequence_path, video[2] + '-' + str(seq_length) +
                        '-features')  # numpy will auto-append .npy

    # Check if we already have it.
    if os.path.isfile(path + '.npy'):
def __init__(self):
    self.preprocessor = Preprocessor()
    self.extractor = Extractor()
    self.normalizer = Normalizer()
def Extrair():
    print('Informe o Tempo de Processamento (em segundos): ', end='')  # "Enter the processing time (in seconds): "
    tempo = float(input())
    # Connects to the Twitter API and extracts data into the local database.
    extracao = Extractor(tempo)
class InteractivePredictor:
    exit_keywords = ['exit', 'quit', 'q']

    def __init__(self, config, model):
        model.predict([])
        self.model = model
        self.config = config
        self.path_extractor = Extractor(config, EXTRACTION_API,
                                        self.config.MAX_PATH_LENGTH,
                                        max_path_width=2)

    @staticmethod
    def read_file(input_filename):
        with open(input_filename, 'r') as file:
            return file.readlines()

    def predict(self):
        input_filename = 'Input.java'
        print('Serving')
        while True:
            print('Modify the file: "' + input_filename +
                  '" and press any key when ready, or "q" / "exit" to exit')
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return
            user_input = ' '.join(self.read_file(input_filename))
            try:
                predict_lines, pc_info_dict = self.path_extractor.extract_paths(user_input)
            except ValueError:
                continue
            model_results = self.model.predict(predict_lines)

            prediction_results = Common.parse_results(model_results, pc_info_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            for index, method_prediction in prediction_results.items():
                print('Original name:\t' + method_prediction.original_name)
                if self.config.BEAM_WIDTH == 0:
                    print('Predicted:\t%s' %
                          [step.prediction for step in method_prediction.predictions])
                    for timestep, single_timestep_prediction in enumerate(method_prediction.predictions):
                        print('Attention:')
                        print('TIMESTEP: %d\t: %s' % (timestep, single_timestep_prediction.prediction))
                        for attention_obj in single_timestep_prediction.attention_paths:
                            print('%f\tcontext: %s,%s,%s' % (
                                attention_obj['score'], attention_obj['token1'],
                                attention_obj['path'], attention_obj['token2']))
                else:
                    print('Predicted:')
                    for predicted_seq in method_prediction.predictions:
                        print('\t%s' % predicted_seq.prediction)
def __init__(self):
    self.__extractor = Extractor()
    self.__tokeniser = Tokeniser()
    self.__tagger = Tagger()
    self.__dataset = Dataset()
    self.__logger = Logger()
def __init__(self):
    self.Users = Users()
    self.extractor = Extractor()
    self.list_User = self.extractor.extractorUsers
class InteractivePredictor:
    exit_keywords = ['exit', 'quit', 'q']

    def __init__(self, config, model):
        model.predict([])
        self.model = model
        self.config = config
        self.path_extractor = Extractor(config,
                                        jar_path=JAR_PATH,
                                        max_path_length=MAX_PATH_LENGTH,
                                        max_path_width=MAX_PATH_WIDTH)

    def read_file(self, input_filename):
        with open(input_filename, 'r') as file:
            return file.readlines()

    def predict(self):
        input_filename = 'Input.java'
        print('Starting interactive prediction...')
        while True:
            print('Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
            user_input = input()
            if user_input.lower() in self.exit_keywords:
                print('Exiting...')
                return
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue
            results, code_vectors = self.model.predict(predict_lines)
            prediction_results = common.parse_results(results, hash_to_string_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            for i, method_prediction in enumerate(prediction_results):
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
                print('Attention:')
                for attention_obj in method_prediction.attention_paths:
                    print('%f\tcontext: %s,%s,%s' % (attention_obj['score'], attention_obj['token1'],
                                                     attention_obj['path'], attention_obj['token2']))
                if self.config.EXPORT_CODE_VECTORS:
                    print('Code vector:')
                    print(' '.join(map(str, code_vectors[i])))

    def dn_predict(self):
        # input_filename = 'Input.java'
        # input_filename = input()
        print('Starting interactive prediction...')
        data_list = glob.glob("data/in_use/*/*.java")
        for input_filename in data_list:
            # while True:
            #     print('Modify the file: "%s" and press any key when ready, or "q" / "quit" / "exit" to exit' % input_filename)
            #     user_input = input()
            #     input_filename = input()
            #     if user_input.lower() in self.exit_keywords:
            print(input_filename)
            if input_filename.lower() in self.exit_keywords:
                print('Exiting...')
                return
            try:
                predict_lines, hash_to_string_dict = self.path_extractor.extract_paths(input_filename)
            except ValueError as e:
                print(e)
                continue
            results, code_vectors = self.model.predict(predict_lines)
            prediction_results = common.parse_results(results, hash_to_string_dict,
                                                      topk=SHOW_TOP_CONTEXTS)
            for i, method_prediction in enumerate(prediction_results):
                print('Original name:\t' + method_prediction.original_name)
                for name_prob_pair in method_prediction.predictions:
                    print('\t(%f) predicted: %s' % (name_prob_pair['probability'], name_prob_pair['name']))
                print('Attention:')
                for attention_obj in method_prediction.attention_paths:
                    print('%f\tcontext: %s,%s,%s' % (attention_obj['score'], attention_obj['token1'],
                                                     attention_obj['path'], attention_obj['token2']))
                if self.config.EXPORT_CODE_VECTORS:
                    print('Code vector:')
                    print(' '.join(map(str, code_vectors[i])))
                    with open('jms_output.txt', 'a') as f_out:
                        f_out.write("{}\t{}\n".format(input_filename, ', '.join(map(str, code_vectors[i]))))
import sys
import time
import cv2
import numpy as np
from data import DataSet
from extractor import Extractor
from keras.models import load_model
from insert1 import insert_img
from Connector_mysql import connect
import winsound
import re
import math
import io

print("Loading Model .......")
saved_LSTM_model = load_model("data\\checkpoints\\lstm-features.022-0.035.hdf5",
                              compile=False)
extract_model = Extractor(image_shape=(320, 240, 3))
print("****************************Model Ready.......***************************")


def video(video_file):
    # print('time take to load imports {:0.3f}'.format(time.time() - start))
    start = time.time()
    '''print(sys.argv)
    if (len(sys.argv) == 2):
        #seq_length = int(sys.argv[1])
        #class_limit = int(sys.argv[2])
        #saved_model = sys.argv[3]
        #video_file = sys.argv[1]
    else:
    Formats the given file.
    """
    if not file_util.is_missing_or_empty_file(raw_output_path):
        xml = etree.parse(raw_output_path, etree.XMLParser(recover=True))

        sections = []

        # Extract the title.
        title_nodes = xml.xpath(title_xpath)
        sections.append("".join(
            [x.text.replace("\n", " ").strip() for x in title_nodes]))

        # Extract the lines.
        section_nodes = xml.xpath(sections_xpath)
        for node in section_nodes:
            line_nodes = node.xpath(line_xpath)
            sections.append("\n".join([
                x.text.replace("\n", " ").strip() for x in line_nodes
                if x is not None and x.text is not None
            ]))
        return "\n\n".join(sections)
    return ""


if __name__ == "__main__":
    arg_parser = Extractor.get_argument_parser()
    args = arg_parser.parse_args()
    PdfExtractExtractor(args).process()
def __init__(self, config_fp, language="es"):
    Extractor.__init__(self, config_fp)
    self.language = language
def __init__(self, config_fp, language="en"):
    Extractor.__init__(self, config_fp)  # should be modified
    self.language = language
class Annotator():

    __job_position_tag = "EMP-POS"
    __job_company_tag = "EMP-COMP"
    __education_course_tag = "EDU-MAJOR"
    __education_institution_tag = "EDU-INST"

    def __init__(self):
        self.__extractor = Extractor()
        self.__tokeniser = Tokeniser()
        self.__tagger = Tagger()
        self.__dataset = Dataset()
        self.__logger = Logger()

    def prepare_dataset(self, nr_of_docs=-1):
        resumes, labels = self.__extractor.read_raw_files(nr_of_docs)
        resumes = self.__tokeniser.tokenise_docs_to_lines(resumes)
        resumes = self.__tokeniser.tokenise_doclines_to_words(resumes)
        self.__dataset.resume_content = self.annotate_docs(resumes, labels)
        self.__dataset.save()

    # resumes: list of tokenised (by line and word) résumé docs
    # labels: xml structure storing labels for several resumes
    def annotate_docs(self, resumes, labels):
        self.__logger.println("annotating resumes")
        annotated_resumes = []
        for idx, resume in enumerate(resumes):
            annotated_resumes.append(self.annotate_doc(resume, labels[idx]))
            self.__logger.println("annotating resume %s/%s with true labels and pos tags"
                                  % (idx + 1, len(resumes)))
        # non local ner tag entire dataset at a time for speed
        annotated_resumes = self.__tagger.nonlocal_ner_tag(annotated_resumes)
        self.__logger.println("completed annotating resumes")
        return annotated_resumes

    # doc: a single résumé document with token strings in each slot of list
    # labels: xml structure storing pre-extracted information
    def annotate_doc(self, doc, labels):
        job_title_list = self.__extractor.get_job_titles(labels)
        job_company_list = self.__extractor.get_company_names(labels)
        edu_major_list = self.__extractor.get_edu_majors(labels)
        edu_inst_list = self.__extractor.get_edu_institutions(labels)
        # can extract more labels here

        prepared_doc = self.__tagger.prepare_doc(doc)
        prepared_doc = self.__match_entity(prepared_doc, job_title_list, self.__job_position_tag)
        prepared_doc = self.__match_entity(prepared_doc, job_company_list, self.__job_company_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_major_list, self.__education_course_tag)
        prepared_doc = self.__match_entity(prepared_doc, edu_inst_list, self.__education_institution_tag)
        prepared_doc = self.__tagger.add_default_entity_tags(prepared_doc)
        prepared_doc = self.__tagger.pos_tag(prepared_doc)
        return prepared_doc

    # doc: résumé doc to be annotated
    # entity_list: list of labels to matched in doc
    # tag: tag to be assigned if match found
    def __match_entity(self, doc, entity_list, tag):
        for entity in entity_list:
            doc = self.__tagger.match_label(doc, entity, tag)
        return doc

    # function takes in a path to file and annotates it for tagging
    # to be ideally used to tag as a one off for testing
    # filepath: path to résumé
    def annotate_using_trained_model(self, filepath):
        resume_content = self.__extractor.read_resume_content(filepath)
        resume_content = self.__tokeniser.tokenise_docs_to_lines(resume_content)
        resume_content = self.__tokeniser.tokenise_doclines_to_words(resume_content)
        prepared_doc = self.__tagger.prepare_doc(resume_content[0])
        prepared_doc = self.__tagger.pos_tag(prepared_doc)
        prepared_doc = self.__tagger.nonlocal_ner_tag([prepared_doc])
        return prepared_doc[0]
class InjExtLoop:

    def __init__(self):
        super().__init__()
        self.particles = "e"
        self.req_particles = None
        self.pu_mode = None
        self.req_pu_mode = None
        self.req_kickers_mode = False
        self.state = 'idle'
        self.ic_runmode = 'idle'

        self.linStarter = LinStarter()
        self.extractor = Extractor()
        self.modeCtl = ModesClient()
        self.pu_ctl = PUSwitcher()
        self.mode_subsys = [37, 38, 39]

        self.modeCtl.markedReady.connect(self.kickers_loaded)
        self.linStarter.runDone.connect(self.next_state)
        self.extractor.extractionDone.connect(self.next_state)
        self.pu_ctl.switching_done.connect(self.next_state)

        self.timer = cda.Timer()
        self.calibr_timer = cda.Timer()

        self.states = [
            self.__idle, self.__preinject, self.__inject2, self.__injected,
            self.__preextract, self.__extract2, self.__extracted,
            self.__pu_switching, self.__pu_switched
        ]

        # output channels
        self.c_state = cda.StrChan('cxhw:0.ddm.state', on_update=True, max_nelems=20)
        self.c_stateMsg = cda.StrChan('cxhw:0.ddm.stateMsg', on_update=True, max_nelems=100)
        self.c_icrunmode = cda.StrChan('cxhw:0.ddm.ICRunMode', on_update=True, max_nelems=20)

        # command channels
        self.cmds = [
            'stop', 'inject', 'extract', 'nround', 'autorun',
            'e2v4', 'p2v4', 'e2v2', 'p2v2'
        ]
        self.c_cmds = [
            cda.IChan('cxhw:0.ddm.' + x, on_update=True) for x in self.cmds
        ]
        for c in self.c_cmds:
            c.valueMeasured.connect(self.cmd_proc)

        # option-command channels
        self.c_particles = cda.StrChan('cxhw:0.ddm.particles', on_update=True, max_nelems=20)
        self.c_particles.valueMeasured.connect(self.particles_update)
        self.c_particles.setValue(self.particles)
        self.c_extr_train = cda.IChan('cxhw:0.ddm.extr_train', on_update=True)
        self.c_extr_train.valueMeasured.connect(self.train_proc)
        self.c_extr_train_interval = cda.DChan('cxhw:0.ddm.extr_train_interval', on_update=True)
        self.c_extr_train_interval.valueMeasured.connect(self.train_interval_update)

        # event channels
        self.c_injected = cda.IChan('cxhw:0.ddm.injected', on_update=True)
        self.c_extracted = cda.IChan('cxhw:0.ddm.extracted', on_update=True)

        # beam current channels
        self.c_beamcur = cda.DChan('cxhw:0.dcct.beamcurrent', on_update=True)
        self.c_extr_beamCur = cda.DChan('cxhw:0.dcct.ExtractionCurrent', on_update=True)

        self.c_v2k_auto = cda.IChan('cxhw:0.ddm.v2k_auto', on_update=True)
        self.c_v2k_particles = cda.StrChan('cxhw:0.bep.particles', on_update=True, max_nelems=20)
        self.c_v2k_particles.valueMeasured.connect(self.v2k_auto_mode)
        self.c_v2k_offline = cda.IChan('cxhw:0.bep.offline', on_update=True)
        self.c_v2k_offline.valueMeasured.connect(self.v2k_offline_proc)

        self.linbeam_cor = LinBeamCtl()

    def v2k_offline_proc(self, chan):
        if self.c_v2k_auto.val == 0 or self.pu_mode not in {'e2v2', 'p2v2'}:
            return
        if self.c_v2k_offline.val == 1:
            self.linbeam_cor.close_beam()
        elif self.c_v2k_offline.val == 0:
            self.linbeam_cor.open_beam()

    def v2k_auto_mode(self, chan):
        if self.c_v2k_auto.val == 0 or self.req_pu_mode is not None:
            return
        if chan.val == 'positrons' and self.pu_mode == 'e2v2':
            self.p2v2()
        if chan.val == 'electrons' and self.pu_mode == 'p2v2':
            self.e2v2()

    def train_interval_update(self, chan):
        if chan.val > 0:
            self.extractor.set_training_interval(chan.val)
        else:
            chan.setValue(self.extractor.training_interval)

    def train_proc(self, chan):
        if chan.val and self.ic_runmode == 'idle':
            self.extractor.start_training()

    def particles_update(self, chan):
        if self.req_pu_mode is not None:
            return
        if self.particles == chan.val or chan.val not in {'e', 'p'}:
            return
        if self.ic_runmode == 'idle':
            self.set_particles(chan.val)
        else:
            self.req_particles = chan.val

    def set_particles(self, p):
        if self.particles == p:
            return
        self.particles = p
        self.linStarter.set_particles(self.particles)
        if self.c_particles.val != p:
            self.c_particles.setValue(p)

    def set_pu_mode(self, mode):
        if self.pu_mode == mode:
            return
        self.req_pu_mode = mode
        if self.ic_runmode == 'idle':
            self.run_state('pu_switching')

    def kickers_loaded(self):
        if self.req_kickers_mode:
            self.timer.singleShot(80, self.next_state)
            self.req_kickers_mode = False

    def run_state(self, state=None):
        if state is not None:
            self.state = state
        self.c_state.setValue(self.state)
        if self.ic_runmode == 'idle':
            return
        s_ind = state_names.index(self.state)
        self.c_stateMsg.setValue(stateMsg[s_ind])
        self.states[s_ind]()

    def next_state(self):
        s_ind = state_names.index(self.state)
        ns_ind = s_ind + 1
        if ns_ind < len(state_names):
            self.state = state_names[ns_ind]
            self.run_state()

    def __idle(self):
        pass

    def __preinject(self):
        if self.req_particles is not None:
            self.set_particles(self.req_particles)
            self.req_particles = None
        if self.req_pu_mode is not None:
            self.run_state('pu_switching')
            return
        self.req_kickers_mode = True
        self.modeCtl.load_marked(self.particles + 'inj', self.mode_subsys, ['rw'])

    def __inject2(self):
        self.linStarter.start()

    def __injected(self):
        self.c_injected.setValue(1)
        if self.ic_runmode in {"single-cycle", "auto-cycle"}:
            self.next_state()

    def __preextract(self):
        self.req_kickers_mode = True
        self.modeCtl.load_marked(self.particles + 'ext', self.mode_subsys, ['rw'])

    def __extract2(self):
        self.c_extr_beamCur.setValue(self.c_beamcur.val)
        self.extractor.extract()

    def __extracted(self):
        self.c_extracted.setValue(1)
        if self.ic_runmode == "auto-cycle":
            self.state = "preinject"
            self.run_state()

    def __pu_switching(self):
        if self.req_pu_mode is None:
            print('mode not requested')
            return
        self.modeCtl.load_marked(self.req_pu_mode, [7])
        self.set_particles(self.req_pu_mode[0])
        self.pu_ctl.switch_mode(self.req_pu_mode)

    def __pu_switched(self):
        self.pu_mode = self.req_pu_mode
        self.req_pu_mode = None
        if self.ic_runmode == "auto-cycle":
            self.run_state("preinject")
        else:
            self.run_state('idle')

    def cmd_proc(self, chan):
        if chan.first_cycle:
            return
        sn = chan.short_name()
        getattr(self, sn)()

    def set_runmode(self, runmode):
        self.ic_runmode = runmode
        self.c_icrunmode.setValue(runmode)

    def stop(self):
        self.linStarter.stop()
        self.extractor.stop()
        self.set_runmode('idle')
        self.run_state('idle')

    def inject(self):
        self.set_runmode("single-action")
        self.run_state('preinject')

    def extract(self):
        # check if something injected
        self.set_runmode("single-action")
        self.run_state('preextract')

    def nround(self):
        self.set_runmode("single-cycle")
        self.run_state('preinject')

    def autorun(self):
        self.set_runmode("auto-cycle")
        self.run_state('preinject')

    def e2v4(self):
        self.set_pu_mode('e2v4')

    def p2v4(self):
        self.set_pu_mode('p2v4')

    def e2v2(self):
        self.set_pu_mode('e2v2')

    def p2v2(self):
        self.set_pu_mode('p2v2')
def __init__(self, config, model):
    model.predict([])
    self.model = model
    self.config = config
    self.path_extractor = Extractor(config, EXTRACTION_API,
                                    self.config.MAX_PATH_LENGTH,
                                    max_path_width=2)
def main():
    parser = argparse.ArgumentParser("PyTorch Face Recognizer")
    parser.add_argument('cmd', type=str, choices=['train', 'test', 'extract'],
                        help='train, test or extract')
    parser.add_argument('--arch_type', type=str, default='resnet50_ft',
                        help='model type',
                        choices=['resnet50_ft', 'senet50_ft',
                                 'resnet50_scratch', 'senet50_scratch'])
    parser.add_argument('--dataset_dir', type=str, default='/path/to/dataset_directory',
                        help='dataset directory')
    # parser.add_argument('--log_file', type=str, default='/path/to/log_file', help='log file')
    # parser.add_argument('--train_img_list_file', type=str, default='/path/to/train_image_list.txt',
    #                     help='text file containing image files used for training')
    # parser.add_argument('--test_img_list_file', type=str, default='/path/to/test_image_list.txt',
    #                     help='text file containing image files used for validation, test or feature extraction')
    # parser.add_argument('--meta_file', type=str, default='/path/to/identity_meta.csv', help='meta file')
    # parser.add_argument('--checkpoint_dir', type=str, default='/path/to/checkpoint_directory',
    #                     help='checkpoints directory')
    parser.add_argument('--feature_dir', type=str, default='/path/to/feature_directory',
                        help='directory where extracted features are saved')
    # parser.add_argument('-c', '--config', type=int, default=1, choices=configurations.keys(),
    #                     help='the number of settings and hyperparameters used in training')
    # parser.add_argument('--batch_size', type=int, default=32, help='batch size')
    # parser.add_argument('--resume', type=str, default='', help='checkpoint file')
    parser.add_argument('--weight_file', type=str, default='/path/to/weight_file.pkl',
                        help='weight file')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    # parser.add_argument('--horizontal_flip', action='store_true',
    #                     help='horizontally flip images specified in test_img_list_file')
    args = parser.parse_args()
    print(args)

    if args.cmd == "extract":
        utils.create_dir(args.feature_dir)

    if args.cmd == 'train':
        utils.create_dir(args.checkpoint_dir)
        cfg = configurations[args.config]

    log_file = args.log_file
    # resume = args.resume

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    cuda = torch.cuda.is_available()
    if cuda:
        print("torch.backends.cudnn.version: {}".format(torch.backends.cudnn.version()))

    torch.manual_seed(1337)
    if cuda:
        torch.cuda.manual_seed(1337)

    # 0. id label map
    # meta_file = args.meta_file
    # id_label_dict = utils.get_id_label_map(meta_file)

    # 1. data loader
    root = args.dataset_dir
    # train_img_list_file = args.train_img_list_file
    # test_img_list_file = args.test_img_list_file

    kwargs = {'num_workers': args.workers, 'pin_memory': True} if cuda else {}
    # if args.cmd == 'train':
    #     dt = datasets.VGG_Faces2(root, train_img_list_file, id_label_dict, split='train')
    #     train_loader = torch.utils.data.DataLoader(dt, batch_size=args.batch_size, shuffle=True, **kwargs)

    dv = datasets.VGG_Faces2(root, test_img_list_file, id_label_dict, split='valid',
                             horizontal_flip=args.horizontal_flip)
    val_loader = torch.utils.data.DataLoader(dv, batch_size=args.batch_size,
                                             shuffle=False, **kwargs)

    # 2. model
    include_top = True if args.cmd != 'extract' else False
    if 'resnet' in args.arch_type:
        model = ResNet.resnet50(num_classes=N_IDENTITY, include_top=include_top)
    else:
        model = SENet.senet50(num_classes=N_IDENTITY, include_top=include_top)
    # print(model)

    start_epoch = 0
    start_iteration = 0

    utils.load_state_dict(model, args.weight_file)
    # if resume:
    #     checkpoint = torch.load(resume)
    #     model.load_state_dict(checkpoint['model_state_dict'])
    #     start_epoch = checkpoint['epoch']
    #     start_iteration = checkpoint['iteration']
    #     assert checkpoint['arch'] == args.arch_type
    #     print("Resume from epoch: {}, iteration: {}".format(start_epoch, start_iteration))
    # else:
    #     utils.load_state_dict(model, args.weight_file)
    #     if args.cmd == 'train':
    #         model.fc.reset_parameters()

    if cuda:
        model = model.cuda()

    criterion = nn.CrossEntropyLoss()
    if cuda:
        criterion = criterion.cuda()

    extractor = Extractor(
        cuda=cuda,
        model=model,
        val_loader=val_loader,
        log_file=log_file,
        feature_dir=args.feature_dir,
        flatten_feature=True,
        print_freq=1,
    )
    extractor.extract()
def extract_data(cls, file):
    """Uses the extraction method from the Extractor class."""
    ext = Extractor()
    ext.set_file(file)
    return ext.get_component_dictionary()
from gensim.models import KeyedVectors
from extractor import Extractor
import pandas as pd
import sys

if __name__ == '__main__':
    assert len(sys.argv) == 5, \
        "Need trained word2vec path / dataset path / product id (-1 to work with all) / max ngrams per tfidf"

    w2v_path = sys.argv[1]
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

    extractor = Extractor()
    extractor.word2vec = w2v

    df_path = sys.argv[2]
    df = pd.read_csv(df_path)
    products = list(set(df["PRODUCT"]))
    products.sort()

    product_id = int(sys.argv[3])

    def shor_product(product_id):
        max_ngram_per_tfidf = int(sys.argv[4])
        index = df["PRODUCT"] == product_id
        texts = list(df.loc[index, "TEXT"]) + \
            list(df.loc[index, "BENEFITS"]) + \
            list(df.loc[index, "DRAWBACKS"])
        texts = list(map(str, filter(bool, texts)))
        extracted = extractor.transform(texts, 1, 4, max_ngram_per_tfidf)
        print(product_id)
        print(extracted)

    if product_id == -1: