Exemplo n.º 1
0
    def get_entities(self, text, url):
        """
        Return the entities found in ``text``, caching the result per text.

        The MD5 hex digest of the text is the key into ``self.cached_docs``.
        On a cache miss the text is sent over ``self.soc_spacy`` (terminated
        by ``--end--``) and the JSON reply is decoded.  ``PHONE`` entities are
        passed to ``WebsiteContactMeta.get_valid_country_phone``: a falsy
        result drops the entity, otherwise the result replaces its ``text``.

        ``self.has_contacts`` is reset to ``False`` on entry and becomes
        ``True`` when a returned entity is labelled ``PHONE`` or ``EMAIL``.

        :param text: text to analyse; a falsy value yields an empty list
        :param url: only used to prefix the log message when the lookup fails
        :return: list of entity dicts; if the lookup raises, the entities
                 collected so far are returned and ``has_contacts`` stays
                 ``False``
        """
        self.has_contacts = False
        entities = []
        if not text:
            return entities

        cache_key = hashlib.md5(text.encode()).hexdigest()
        try:
            if cache_key not in self.cached_docs:
                message = text.encode('utf8') + '--end--'.encode('utf8')
                self.soc_spacy.sendall(message)

                for entity in json.loads(recv_end(self.soc_spacy)):
                    if entity['label'] == 'PHONE':
                        # if the phone is not valid for the country ignore it
                        country_phone = WebsiteContactMeta.get_valid_country_phone(
                            self.country_codes, entity['text'])
                        if not country_phone:
                            continue
                        entity['text'] = country_phone
                    entities.append(entity)

                # only complete results are cached
                self.cached_docs[cache_key] = entities
            else:
                entities = self.cached_docs[cache_key]
        except Exception as error:
            logger.error("%s : %s", url, error)
            return entities

        # flag the text as carrying contact details
        for entity in entities:
            if entity['label'] in ('PHONE', 'EMAIL'):
                self.has_contacts = True

        return entities
Exemplo n.º 2
0
    def handle(self, *args, **options):
        """
        Load the category model, then serve predictions on port 50008.

        Once ``MLUtils.prepare`` has been called with the data returned by
        ``CategoryWebsiteText.load_ml_data``, the method loops forever:
        connections arriving on the listening sockets are accepted and added
        to the read set; for every other readable socket one message is read
        with ``recv_end``, passed to ``MLUtils.predict_category`` and the
        JSON-encoded result is sent back followed by ``--end--``.  A socket
        for which ``recv_end`` returns nothing is closed and dropped.

        A failure while serving one socket is logged and does not stop the
        loop.  Only ``Exception`` subclasses are caught, so
        ``KeyboardInterrupt`` and ``SystemExit`` can still stop the server.

        :param args: unused
        :param options: unused
        :return: never returns normally
        """
        # Function-scope import: no module-level logger is visible from here.
        import logging

        log = logging.getLogger(__name__)

        self.stdout.write(str(time.time()), ending='\n')
        self.stdout.write("Starting server", ending='\n')
        self.stdout.write("Loading model", ending='\n')
        categories = CategoryWebsiteText.load_ml_data()
        MLUtils.prepare(categories)
        self.stdout.write("Model loaded", ending='\n')
        self.stdout.write(str(time.time()), ending='\n')
        main_socks, read_socks, write_socks = socket_bind('', 50008)

        while True:
            readable, writeable, exceptions = select(read_socks, write_socks,
                                                     [])
            for sock_obj in readable:
                if sock_obj in main_socks:
                    # listening socket: register the new client connection
                    new_sock, address = sock_obj.accept()
                    print('Connect:', address, id(new_sock))
                    read_socks.append(new_sock)
                else:
                    try:
                        data = recv_end(sock_obj)
                        if not data:
                            # nothing received: drop the connection
                            sock_obj.close()
                            read_socks.remove(sock_obj)
                        else:
                            res = MLUtils.predict_category(data)
                            sock_obj.sendall(
                                json.dumps(res).encode('utf8') +
                                '--end--'.encode('utf8'))
                    except Exception:
                        # keep serving the other clients, but leave a trace
                        log.exception(
                            "Error while serving socket %s", id(sock_obj))
Exemplo n.º 3
0
    def handle(self, *args, **options):
        """
        Load the spaCy model and serve entity extraction on ``settings.SPACY_PORT``.

        The model is loaded from ``settings.SPACY_CUSTOMN_MODEL_FOLDER`` with
        the ``parser``, ``tagger`` and ``textcat`` components disabled, and
        the custom ``Span``/``Doc`` extensions used by this command are
        registered.  The method then loops forever: connections arriving on
        the listening sockets are accepted and added to the read set; for
        every other readable socket one message is read with ``recv_end``,
        run through the model, and the truthy results of ``self.get_ent`` for
        its entities are sent back as JSON followed by ``--end--``.  A socket
        for which ``recv_end`` returns nothing is closed and dropped.

        A failure while serving one socket is logged and does not stop the
        loop.  Only ``Exception`` subclasses are caught, so
        ``KeyboardInterrupt`` and ``SystemExit`` can still stop the server.

        :param args: unused
        :param options: unused
        :return: never returns normally
        """
        spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER, disable=['parser', 'tagger', 'textcat'])
        Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
        Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
        Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
        Doc.set_extension('_lines', default=list())

        logger.debug("Loaded spacy server")
        main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
        while True:
            readable, writeable, exceptions = select(read_socks, write_socks, [])
            for sockobj in readable:
                if sockobj in main_socks:
                    # listening socket: register the new client connection
                    new_sock, address = sockobj.accept()
                    logger.debug('Connect: %s - %s', address, id(new_sock))
                    read_socks.append(new_sock)
                else:
                    try:
                        entities = []
                        data = recv_end(sockobj)
                        if not data:
                            # nothing received: drop the connection
                            sockobj.close()
                            read_socks.remove(sockobj)
                        else:
                            for doc in spacy_model.pipe([data]):
                                # character offsets of every newline in the text
                                doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                                for ent in doc.ents:
                                    current_entity = self.get_ent(ent)
                                    if current_entity:
                                        entities.append(current_entity)

                            sockobj.sendall(json.dumps(entities).encode('utf8') + '--end--'.encode('utf8'))
                    except Exception:
                        # keep serving the other clients, but leave a trace
                        logger.exception("Error while serving socket %s", id(sockobj))
Exemplo n.º 4
0
def category(request):
    """
    Return the category ids predicted for the posted text as JSON.

    Reads ``website_id`` and ``text`` from ``request.POST``.  When ``text``
    is non-empty it is sent, terminated by ``--end--``, to the category
    server on port 50008 and the reply is JSON-decoded.  No connection is
    opened when there is no text to classify.

    :param request: request whose POST data may carry ``text`` and
                    ``website_id``
    :return: ``HttpResponse`` with the JSON-encoded list of category ids;
             the list is empty when no text was posted or the exchange
             with the server failed
    """
    website_id = request.POST.get('website_id', '')
    text = request.POST.get('text', '')
    category_ids = []

    if text:
        # category socket; the ``with`` block closes it on every path,
        # including a failing connect()
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as soc_category:
            soc_category.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            connect(soc_category, '', 50008)
            try:
                soc_category.sendall(
                    text.encode('utf8') + '--end--'.encode('utf8'))
                category_ids = json.loads(recv_end(soc_category))
            except Exception:
                # same message as before, now with the traceback attached
                logger.info("%serror", website_id, exc_info=True)

    return HttpResponse(json.dumps(category_ids),
                        content_type='application/json')
 def handle(self, *args, **options):
     """
     Serve address parsing on port 50006.

     The method loops forever: connections arriving on the listening
     sockets are accepted and added to the read set; for every other
     readable socket one message is read with ``recv_end``, passed to
     ``parse_address`` and the JSON-encoded result is sent back followed
     by ``--end--``.  A socket for which ``recv_end`` returns nothing is
     closed and dropped.

     A failure while serving one socket is logged and does not stop the
     loop.  Only ``Exception`` subclasses are caught, so
     ``KeyboardInterrupt`` and ``SystemExit`` can still stop the server.

     :param args: unused
     :param options: unused
     :return: never returns normally
     """
     # Function-scope import: no module-level logger is visible from here.
     import logging

     log = logging.getLogger(__name__)

     self.stdout.write("Loaded location server", ending='\n')
     main_socks, read_socks, write_socks = socket_bind('', 50006)
     while True:
         readable, writeable, exceptions = select(read_socks, write_socks,
                                                  [])
         for sockobj in readable:
             if sockobj in main_socks:
                 # listening socket: register the new client connection
                 new_sock, address = sockobj.accept()
                 print('Connect:', address, id(new_sock))
                 read_socks.append(new_sock)
             else:
                 try:
                     data = recv_end(sockobj)
                     if not data:
                         # nothing received: drop the connection
                         sockobj.close()
                         read_socks.remove(sockobj)
                     else:
                         new_data = parse_address(data)
                         sockobj.sendall(
                             json.dumps(new_data).encode('utf8') +
                             '--end--'.encode('utf8'))
                 except Exception:
                     # keep serving the other clients, but leave a trace
                     log.exception(
                         "Error while serving socket %s", id(sockobj))
Exemplo n.º 6
0
def get_location(soc, text, country_code):
    """
    Extract locations from ``text`` by parsing it line by line over ``soc``.

    Each line is stripped and cleaned (characters other than word
    characters, ``.``, ``,`` and ``-`` become spaces, runs of spaces are
    collapsed) and sent over ``soc`` terminated by ``--end--``; the JSON
    reply is cached per letters-only lowercase form of the line.  Replies
    accepted by ``is_valid`` are kept in a window of the last three, whose
    ``(value, label)`` items are merged into one candidate address.  When
    the candidate holds enough of the labels in ``keys`` /
    ``secondary_keys`` it is passed to ``get_verified_address`` and a truthy
    result is stored through ``add_to_final_locations``, after which the
    window is reset.

    :param soc: connected socket used for the parsing requests
    :param text: text to scan; ``None`` or an empty string yields ``[]``
    :param country_code: forwarded to ``set_country_code`` for each
                         candidate address
    :return: list of the verified locations if there are any, otherwise the
             unverified ones, otherwise an empty list
    """
    if not text:
        return []

    # compiled once instead of on every line
    disallowed_chars = re.compile("[^\\w.,-]")
    repeated_spaces = re.compile(" +")
    non_letters = re.compile('[^A-Za-z]')

    final_locations = {}
    unverified_locations = {}

    saved_address = collections.deque(maxlen=3)
    saved_sentences = collections.deque(maxlen=3)

    # replies already received, keyed by the normalised sentence
    saved_socks = {}

    for sentence in text.split('\n'):
        sentence = sentence.strip()
        sentence = disallowed_chars.sub(" ", sentence)
        sentence = repeated_spaces.sub(" ", sentence)

        sentence_key = non_letters.sub('', sentence.lower())
        if len(sentence_key) <= 1:
            continue

        if sentence_key in saved_socks:
            address = saved_socks[sentence_key]
        else:
            soc.sendall(sentence.encode('utf8') + '--end--'.encode('utf8'))
            address = json.loads(recv_end(soc))
            saved_socks[sentence_key] = address

        if not is_valid(address, saved_address, saved_sentences):
            continue

        saved_address.append(address)
        saved_sentences.append(sentence)

        # merge the remembered replies; later ones override earlier labels
        single_address = {}
        for locations in filter(None, saved_address):
            single_address.update({
                value[1]: value[0].strip()
                for value in locations if value[1] not in ignored
            })

        set_country_code(single_address, country_code)
        found_labels = set(single_address)
        primary_count = len(found_labels & set(keys))
        secondary_count = len(found_labels & set(secondary_keys))

        verified_address = None
        unverified_address = None
        if primary_count > 1:
            verified_address = get_verified_address(single_address)
        elif (not final_locations and primary_count > 0
              and secondary_count > 1 and 'city' in single_address):
            # this has been done to counter 'this has been done on 2014/25', which would have been translated to
            # nigeria, on, number: 2014...
            unverified_address = get_verified_address(single_address)

        current_address = verified_address or unverified_address
        if current_address:
            # labels in ``keys`` come from the looked-up address only
            for key in keys:
                single_address.pop(key, None)
            current_address.update(single_address)

            if verified_address:
                add_to_final_locations(current_address, final_locations)
            else:
                add_to_final_locations(current_address, unverified_locations)

            # start a fresh window after a match
            saved_address = collections.deque(maxlen=3)
            saved_sentences = collections.deque(maxlen=3)

    if final_locations:
        return list(final_locations.values())
    if unverified_locations:
        return list(unverified_locations.values())
    return []