Example No. 1
import json

import tornado.web

# Ner is provided by the surrounding project.


class NerService(tornado.web.RequestHandler):

    def initialize(self, concepts_inlinks, stopwords, entities,
                   inlinks_threshold=400, MAX_WORDS=400, MAX_CHARS=2000):
        """Store the processing limits and build the Ner instance from the injected data."""
        self.MAX_WORDS = MAX_WORDS
        self.MAX_CHARS = MAX_CHARS
        self.ner = Ner(concepts_inlinks, entities, stopwords,
                       inlinks_threshold=inlinks_threshold,
                       max_words=MAX_WORDS)
        
    def get(self):
        # Get parameters
        inlinks_threshold = int(self.get_argument("inlinks_threshold", default=400))
        self.ner.inlinks_threshold = inlinks_threshold
        text = self.get_argument("text")
        # Query arguments arrive as strings, so compare explicitly;
        # a bare truthiness test would treat "false" as enabled.
        debug = self.get_argument("debug", default="false").lower() in ("1", "true")
        # Collect warnings about inputs that exceed the processing limits
        warning = []
        if len(text) > self.MAX_CHARS:
            warning.append('Only the first %d chars will be processed; this request exceeds that limit.' % self.MAX_CHARS)
        if len(text.split(' ')) > self.MAX_WORDS:
            warning.append('Only the first %d words will be processed; this request exceeds that limit.' % self.MAX_WORDS)
        result = self.ner.fetch_entities(text)
        # Drop the echoed input text from the response
        del result['text']
        # If any warnings were collected, attach them to the output
        if warning:
            result['warnings'] = warning
        if debug:
            self.write(result)
        else:
            self.write({"concepts": list(result["results"].keys())})

    def post(self):
        results = list()
        # The body is expected to be UTF-8 text, one tab-separated record
        # per line, with the text to annotate in the first field.
        for line in str(self.request.body, 'utf8').split('\n'):
            if line:
                fields = line.split('\t')
                text = fields[0]
                concepts = self.ner.fetch_entities(text)
                concept_names = list(concepts['results'].keys())
                results.append({"text": text, "concepts": concept_names})
        self.write({"response": results})

    def __format_post_result(self, response):
        """Flatten a JSON response into a ';;'-separated list of concept names."""
        response_dict = json.loads(response)
        concepts = response_dict['results'].keys()
        if concepts:
            return ";;".join(concepts)
        else:
            return ""
Example No. 2
 def initialize(self,
                concepts_inlinks,
                stopwords,
                inlinks_threshold=400,
                MAX_WORDS=400,
                MAX_CHARS=2000):
     """
     """
     self.MAX_WORDS = MAX_WORDS
     self.MAX_CHARS = MAX_CHARS
     self.inlinks_threshold = inlinks_threshold
     self.ner = Ner(concepts_inlinks,
                    stopwords,
                    inlinks_threshold=inlinks_threshold,
                    max_words=MAX_WORDS)
Example No. 3
import logging

def initializeNer():
    concepts_inlinks = {}
    stopwords = set()
    entities = set()

    logging.info("Loading concepts...")
    with open('../services/concept_service/data/pagelinks_all.tsv',
              encoding='utf-8',
              errors='ignore') as concepts_file:
        for concept in concepts_file:
            parts = concept.split('\t')
            # Strip the trailing newline so the inlink value is stored clean
            concepts_inlinks[parts[0]] = parts[1].strip()
    logging.info("%s concepts loaded.", len(concepts_inlinks))

    logging.info("Loading stopwords...")
    with open('../services/concept_service/data/stopwords.txt',
              encoding='utf-8',
              errors='ignore') as sw_file:
        for sw in sw_file:
            stopwords.add(sw.strip().lower())
    logging.info("%s stopwords loaded.", len(stopwords))
    return Ner(concepts_inlinks,
               entities,
               stopwords,
               inlinks_threshold=400,
               max_words=500)
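
A quick usage sketch for the loader above, assuming the two data files exist and that fetch_entities returns a dict with a 'results' mapping as in Example No. 1; the sample sentence is illustrative:

ner = initializeNer()
result = ner.fetch_entities("Barack Obama studied at Harvard Law School.")
print(list(result['results'].keys()))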
Example No. 4
import os

# DOCKER_ENV, STORAGE_PATH, ConfigProcessor, Ner, AmqpProcessor,
# PostgreSqlDatabase and create_callback are defined elsewhere in the project.

def main():
    # Pick the config file: docker.json inside a container, dev.json otherwise
    if DOCKER_ENV in os.environ:
        cfg_path = './config/docker.json'
    else:
        cfg_path = './config/dev.json'
    # 1. Configuration
    config_processor = ConfigProcessor(cfg_path)
    cfg = config_processor.get_configs()
    print(cfg)

    global STORAGE_PATH
    STORAGE_PATH = cfg['local_storage_path']

    # 2. Ner
    ner_model = Ner('./ml/devlabs_ner_ontonotes_bert.json')
    # ret = ner_model.handle_file('/home/neurus/Projects/rvision-hackathon-2021-q1/converted/2020/11-2020-Chaes-e-commerce-malware-research.pdf.txt')
    # print(ret)

    # 3. RabbitMq
    amqp_processor = AmqpProcessor(cfg['rabbit_mq'])

    # 4. DB
    pgdb = PostgreSqlDatabase(cfg)

    # 5. Start listening for incoming messages; close the database and
    # AMQP connections whether consumption stops normally or with an error
    try:
        channel = amqp_processor.establish_connection(
            create_callback(ner_model, pgdb))
        channel.start_consuming()
    finally:
        pgdb.close()
        amqp_processor.close_connection()
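
create_callback is not shown in the original. A hedged sketch of what it could look like, assuming each queue message carries the path of a converted text file (matching the commented-out handle_file call above); pgdb.save is a hypothetical persistence method, while basic_ack is pika's standard acknowledgement API:

def create_callback(ner_model, pgdb):
    def on_message(channel, method, properties, body):
        # Assumption: the message body is the path of a converted .txt file
        path = body.decode('utf-8')
        entities = ner_model.handle_file(path)
        pgdb.save(entities)  # hypothetical persistence call
        channel.basic_ack(delivery_tag=method.delivery_tag)
    return on_message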
Example No. 5
 def initialize(self, concepts_inlinks, stopwords, entities, inlinks_threshold=400, MAX_WORDS=400, MAX_CHARS=2000):
     """
     """
     self.MAX_WORDS = MAX_WORDS
     self.MAX_CHARS = MAX_CHARS
     self.ner = Ner(concepts_inlinks, entities, stopwords, inlinks_threshold=inlinks_threshold, max_words=MAX_WORDS)
Example No. 6
import argparse
import os

# Reconstructed preamble: the snippet begins mid-argument-list, so the
# parser setup and the '--file_name'/'--dedup' options below are inferred
# from how args is used later; the real value of data_augument_tag_list
# is not shown in the original.
data_augument_tag_list = ['PER', 'LOC', 'ORG']  # placeholder tag list
parser = argparse.ArgumentParser()
parser.add_argument('--file_name', type=str, required=True,
                    help='name of the input file, without the .txt extension')
parser.add_argument('--dedup', action='store_true',
                    help='deduplicate the synthesized samples')
parser.add_argument(
    '--augment_size',
    type=int,
    default=3,
    help='size of synthesized data. size of total data = (augment_size + 1) * the original data size')
parser.add_argument('--seed', type=int, default=42, help='random seed')

if __name__ == '__main__':
    args = parser.parse_args()
    file_name = args.file_name
    dedup = args.dedup
    augment_size = args.augment_size
    seed = args.seed

    ner = Ner(ner_dir_name='.',
              ignore_tag_list=['O'],
              data_augument_tag_list=data_augument_tag_list,
              augment_size=augment_size,
              seed=seed,
              dedup=dedup)

    aug_samples, aug_sample_tags = ner.augment(file_name='%s.txt' % file_name)

    if dedup:
        target_dir = 'aug_dedup_%d' % augment_size
    else:
        target_dir = 'aug_%d' % augment_size
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    with open(os.path.join(target_dir, '%s.txt' % file_name),
              'w',
              encoding='utf-8') as f:
        # Write one token/tag pair per line with a blank line between
        # samples (the original loop body is truncated; CoNLL-style
        # output is assumed).
        for tokens, tags in zip(aug_samples, aug_sample_tags):
            for token, tag in zip(tokens, tags):
                f.write('%s\t%s\n' % (token, tag))
            f.write('\n')
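
For reference, a toy input file in the layout the augmenter is assumed to read and write (one tab-separated token/tag pair per line, a blank line between sentences); the file name and the exact format of the original data are assumptions:

with open('train.txt', 'w', encoding='utf-8') as f:  # hypothetical file name
    f.write('EU\tB-ORG\n'
            'rejects\tO\n'
            'German\tB-MISC\n'
            'calls\tO\n'
            '\n')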
Example No. 7
import json

import tornado.web

# Ner is provided by the surrounding project.


class NerService(tornado.web.RequestHandler):
    def initialize(self,
                   concepts_inlinks,
                   stopwords,
                   inlinks_threshold=400,
                   MAX_WORDS=400,
                   MAX_CHARS=2000):
        """Store the processing limits and build the Ner instance from the injected data."""
        self.MAX_WORDS = MAX_WORDS
        self.MAX_CHARS = MAX_CHARS
        self.inlinks_threshold = inlinks_threshold
        self.ner = Ner(concepts_inlinks,
                       stopwords,
                       inlinks_threshold=inlinks_threshold,
                       max_words=MAX_WORDS)

    def get(self):
        # Get parameters
        inlinks_threshold = int(
            self.get_argument("inlinks_threshold",
                              default=self.inlinks_threshold))
        self.ner.inlinks_threshold = inlinks_threshold
        text = self.get_argument("text")
        # Query arguments arrive as strings, so compare explicitly;
        # a bare truthiness test would treat "false" as enabled.
        debug = self.get_argument("debug", default="false").lower() in ("1", "true")
        # Collect warnings about inputs that exceed the processing limits
        warning = []
        if len(text) > self.MAX_CHARS:
            warning.append(
                'Only the first %d chars will be processed; this request exceeds that limit.'
                % self.MAX_CHARS)
        if len(text.split(' ')) > self.MAX_WORDS:
            warning.append(
                'Only the first %d words will be processed; this request exceeds that limit.'
                % self.MAX_WORDS)
        result = self.ner.fetch_entities(text)
        # Drop the echoed input text from the response
        del result['text']
        # If any warnings were collected, attach them to the output
        if warning:
            result['warnings'] = warning
        if debug:
            self.write(result)
        else:
            self.write({"concepts": list(result["results"].keys())})

    def post(self):
        results = list()
        # The body is UTF-8 text, one tab-separated record per line,
        # with the text to annotate in the first field.
        for line in str(self.request.body, 'utf8').split('\n'):
            if line:
                fields = line.split('\t')
                text = fields[0]
                concepts = self.ner.fetch_entities(text)
                concept_names = list(concepts['results'].keys())
                results.append({"text": text, "concepts": concept_names})
        self.write({"response": results})

    def __format_post_result(self, response):
        """Flatten a JSON response into a ';;'-separated list of concept names."""
        response_dict = json.loads(response)
        concepts = response_dict['results'].keys()
        if concepts:
            return ";;".join(concepts)
        else:
            return ""
Example No. 8
 def setUp(self):
     # self.ner = BertNer()  # your implementation of NER
     self.input_file = 'ner_test_input.txt'
     self.output_file = 'ner_test_output.txt'
     # predict is called on the class itself, so it is assumed to be a
     # static or class method in this implementation.
     self.pred = Ner.predict(self.input_file)
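
A hedged sketch of a test method that could follow this setUp, assuming predict returns one predicted line per line of the input file so the result can be compared against the expected output file:

 def test_predictions_match_expected(self):
     with open(self.output_file, encoding='utf-8') as f:
         expected = [line.rstrip('\n') for line in f]
     self.assertEqual(self.pred, expected)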