import json

import tornado.web


class NerService(tornado.web.RequestHandler):

    def initialize(self, concepts_inlinks, stopwords, entities,
                   inlinks_threshold=400, MAX_WORDS=400, MAX_CHARS=2000):
        """Store the request limits and build the shared Ner instance."""
        self.MAX_WORDS = MAX_WORDS
        self.MAX_CHARS = MAX_CHARS
        self.ner = Ner(concepts_inlinks, entities, stopwords,
                       inlinks_threshold=inlinks_threshold, max_words=MAX_WORDS)

    def get(self):
        # Read request parameters
        inlinks_threshold = int(self.get_argument("inlinks_threshold", default=400))
        self.ner.inlinks_threshold = inlinks_threshold
        text = self.get_argument("text")
        debug = self.get_argument("debug", default=False)

        # Collect warnings when the request exceeds the configured limits
        warning = []
        if len(text) > self.MAX_CHARS:
            warning.append('Only the first %d chars will be processed. '
                           'This request is over this limit.' % self.MAX_CHARS)
        if len(text.split(' ')) > self.MAX_WORDS:
            warning.append('Only the first %d words will be processed. '
                           'This request is over this limit.' % self.MAX_WORDS)

        result = self.ner.fetch_entities(text)
        # Drop the echoed input text from the response
        del result['text']
        # Attach any warnings to the output
        if warning:
            result['warnings'] = warning
        if debug:
            self.write(result)
        else:
            self.write({"concepts": list(result["results"].keys())})

    def post(self):
        # Process a tab-separated batch: one record per line, text in the first field
        results = []
        for line in str(self.request.body, 'utf8').split('\n'):
            if line:
                fields = line.split('\t')
                text = fields[0]
                concepts = self.ner.fetch_entities(text)
                concept_names = list(concepts['results'].keys())
                results.append({"text": text, "concepts": concept_names})
        self.write({"response": results})

    def __format_post_result(self, response):
        # Join the extracted concept names with ';;' (unused helper kept for reference)
        response_dict = json.loads(response)
        concepts = response_dict['results'].keys()
        if concepts:
            return ";;".join(concepts)
        return ""
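The handler above receives all of its dependencies through initialize(). A minimal sketch of mounting it, assuming a /ner route, port 8888, and toy lookup data (none of which appear in the original code):

# Hypothetical wiring for NerService; the route, port, and toy data are assumptions.
import tornado.ioloop
import tornado.web


def make_app():
    handler_args = dict(
        concepts_inlinks={"Python": "1200"},  # assumed toy data
        stopwords={"the", "a"},               # assumed toy data
        entities=set(),                       # assumed toy data
        inlinks_threshold=400,
    )
    # Tornado passes this dict's items as keyword arguments to initialize()
    return tornado.web.Application([(r"/ner", NerService, handler_args)])


if __name__ == '__main__':
    make_app().listen(8888)  # assumed port
    tornado.ioloop.IOLoop.current().start()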
import logging


def initializeNer():
    """Load concept inlink counts and stopwords from disk and build a Ner instance."""
    concepts_inlinks = {}
    stopwords = set()
    entities = set()

    logging.info("Loading concepts...")
    with open('../services/concept_service/data/pagelinks_all.tsv',
              encoding='utf-8', errors='ignore') as concepts_file:
        for concept in concepts_file:
            parts = concept.split('\t')
            # Map concept name -> inlink count, stripping the trailing newline
            concepts_inlinks[parts[0]] = parts[1].strip()
    logging.info("%s concepts loaded." % len(concepts_inlinks))

    logging.info("Loading stopwords...")
    with open('../services/concept_service/data/stopwords.txt',
              encoding='utf-8', errors='ignore') as sw_file:
        for sw in sw_file:
            stopwords.add(sw.rstrip('\n').lower())
    logging.info("%s stopwords loaded." % len(stopwords))

    return Ner(concepts_inlinks, entities, stopwords,
               inlinks_threshold=400, max_words=500)
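A quick smoke-test sketch for the loader, reusing fetch_entities the way the handlers above do; the sample sentence is made up:

# Hypothetical smoke test; the input sentence is invented for illustration.
import logging

logging.basicConfig(level=logging.INFO)
ner = initializeNer()
result = ner.fetch_entities("Alan Turing worked at Bletchley Park.")
# fetch_entities returns a dict whose 'results' keys are the matched concepts
print(list(result['results'].keys()))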
def main():
    # Use the Docker config when running inside a container, the dev config otherwise
    if DOCKER_ENV in os.environ:
        cfg_path = './config/docker.json'
    else:
        cfg_path = './config/dev.json'

    # 1. Configuration
    config_processor = ConfigProcessor(cfg_path)
    cfg = config_processor.get_configs()

    global STORAGE_PATH
    STORAGE_PATH = cfg['local_storage_path']

    # 2. NER model
    ner_model = Ner('./ml/devlabs_ner_ontonotes_bert.json')

    # 3. RabbitMQ
    amqp_processor = AmqpProcessor(cfg['rabbit_mq'])

    # 4. Database
    pgdb = PostgreSqlDatabase(cfg)

    # 5. Listen for incoming messages; always release connections on exit
    try:
        channel = amqp_processor.establish_connection(
            create_callback(ner_model, pgdb))
        channel.start_consuming()
    finally:
        pgdb.close()
        amqp_processor.close_connection()
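main() only reads local_storage_path and rabbit_mq from the config. A sketch of generating a minimal ./config/dev.json under that assumption; the field names inside rabbit_mq are guesses, since AmqpProcessor's expectations are not shown:

# Hypothetical minimal config; only the two top-level keys main() reads are
# grounded in the code above, the rabbit_mq fields are assumptions.
import json

dev_cfg = {
    "local_storage_path": "./storage",   # read via cfg['local_storage_path']
    "rabbit_mq": {                       # passed whole to AmqpProcessor(...)
        "host": "localhost",             # assumed field
        "queue": "ner_requests",         # assumed field
    },
}

with open('./config/dev.json', 'w', encoding='utf-8') as f:
    json.dump(dev_cfg, f, indent=2)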
parser.add_argument('--augment_size', type=int, default=3,
                    help='size of synthesized data. size of total data = '
                         '(augment_size + 1) * the original data size')
parser.add_argument('--seed', type=int, default=42, help='random seed')

if __name__ == '__main__':
    args = parser.parse_args()
    file_name = args.file_name
    dedup = args.dedup
    augment_size = args.augment_size
    seed = args.seed

    ner = Ner(ner_dir_name='.',
              ignore_tag_list=['O'],
              data_augument_tag_list=data_augument_tag_list,
              augment_size=augment_size,
              seed=seed,
              dedup=dedup)
    aug_samples, aug_sample_tags = ner.augment(file_name='%s.txt' % file_name)

    # Write the augmented samples to a directory named after the settings
    if dedup:
        target_dir = 'aug_dedup_%d' % augment_size
    else:
        target_dir = 'aug_%d' % augment_size
    os.makedirs(target_dir, exist_ok=True)

    with open(os.path.join(target_dir, '%s.txt' % file_name), 'w',
              encoding='utf-8') as f:
        for tokens, tags in zip(aug_samples, aug_sample_tags):
            # Assumed CoNLL-style output: one "token<TAB>tag" per line,
            # blank line between sentences
            for token, tag in zip(tokens, tags):
                f.write('%s\t%s\n' % (token, tag))
            f.write('\n')
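A sketch of preparing a toy input file for this script, assuming the same CoNLL-style token/tag layout used in the output loop above; the --file_name and --dedup flags are inferred from args and are not shown in the snippet:

# Hypothetical toy input in the assumed "token<TAB>tag" layout.
sample = (
    "John\tB-PER\n"
    "lives\tO\n"
    "in\tO\n"
    "Paris\tB-LOC\n"
    "\n"
)
with open('demo.txt', 'w', encoding='utf-8') as f:
    f.write(sample)
# Then (flag names inferred, not confirmed by the snippet):
#   python augment.py --file_name demo --augment_size 3 --seed 42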
class NerService(tornado.web.RequestHandler):

    def initialize(self, concepts_inlinks, stopwords,
                   inlinks_threshold=400, MAX_WORDS=400, MAX_CHARS=2000):
        """Store the request limits and build the shared Ner instance."""
        self.MAX_WORDS = MAX_WORDS
        self.MAX_CHARS = MAX_CHARS
        self.inlinks_threshold = inlinks_threshold
        self.ner = Ner(concepts_inlinks, stopwords,
                       inlinks_threshold=inlinks_threshold, max_words=MAX_WORDS)

    def get(self):
        # Read request parameters, falling back to the configured threshold
        inlinks_threshold = int(
            self.get_argument("inlinks_threshold", default=self.inlinks_threshold))
        self.ner.inlinks_threshold = inlinks_threshold
        text = self.get_argument("text")
        debug = self.get_argument("debug", default=False)

        # Collect warnings when the request exceeds the configured limits
        warning = []
        if len(text) > self.MAX_CHARS:
            warning.append('Only the first %d chars will be processed. '
                           'This request is over this limit.' % self.MAX_CHARS)
        if len(text.split(' ')) > self.MAX_WORDS:
            warning.append('Only the first %d words will be processed. '
                           'This request is over this limit.' % self.MAX_WORDS)

        result = self.ner.fetch_entities(text)
        # Drop the echoed input text from the response
        del result['text']
        # Attach any warnings to the output
        if warning:
            result['warnings'] = warning
        if debug:
            self.write(result)
        else:
            self.write({"concepts": list(result["results"].keys())})

    def post(self):
        # Process a tab-separated batch: one record per line, text in the first field
        results = []
        for line in str(self.request.body, 'utf8').split('\n'):
            if line:
                fields = line.split('\t')
                text = fields[0]
                concepts = self.ner.fetch_entities(text)
                concept_names = list(concepts['results'].keys())
                results.append({"text": text, "concepts": concept_names})
        self.write({"response": results})

    def __format_post_result(self, response):
        # Join the extracted concept names with ';;' (unused helper kept for reference)
        response_dict = json.loads(response)
        concepts = response_dict['results'].keys()
        if concepts:
            return ";;".join(concepts)
        return ""
def setUp(self):
    # self.ner = BertNer()  # your implementation of NER
    self.input_file = 'ner_test_input.txt'
    self.output_file = 'ner_test_output.txt'
    self.pred = Ner.predict(self.input_file)
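A hypothetical companion assertion for this setUp, assuming Ner.predict() returns one prediction per non-empty gold line in ner_test_output.txt (the actual return shape is not shown):

def test_prediction_count_matches_gold(self):
    # Assumes one prediction per non-empty line of the gold file.
    with open(self.output_file, encoding='utf-8') as f:
        gold = [line.strip() for line in f if line.strip()]
    self.assertEqual(len(self.pred), len(gold))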