Example #1
class ESDiffs(object):
    """Implementation of Elastic Search as diff backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    @staticmethod
    def to_id(label, old, new):
        return "%s/%s/%s" % (label, old, new)

    def put(self, label, old_version, new_version, diff):
        """Store a diff between two versions of a regulation node"""
        struct = {
            'label': label,
            'old_version': old_version,
            'new_version': new_version,
            'diff': diff
        }
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'diff', struct,
                      id=self.to_id(label, old_version, new_version))

    def get(self, label, old_version, new_version):
        """Find the associated diff"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'diff',
                                 self.to_id(label, old_version, new_version))
            return result['_source']['diff']
        except ElasticHttpNotFoundError:
            return None
Example #2
File: import.py Project: jucabot/palmyr
def load_pie(
    filename, index_name, type_name, category, name, zone="France", sep=";", display="pie", source="", description=""
):

    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])

    categories = {}
    for line in f:
        # split only on the first separator so values containing the separator stay intact
        key, string_value = line.split(sep, 1)
        value = cjson.decode(string_value)

        categories[key] = value

    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": description % (key),
        "data": {"categories": categories.keys(), "series": [{"data": categories.values()}]},
    }
    es.index(index_name, display, serie)

    es.refresh(index_name)
    f.close()
Example #3
def update_process_datetime(doc_id, timestamp):
  ''' Updates the last_update_date for the document id passed into the function.
    The document id passed in will be the name of another index in the cluster.
  '''
  connection_string = 'http://localhost:9200'
  process_index = 'openfdametadata'
  _type = 'last_run'
  _map = {}
  _map[_type] = {}
  _map[_type]['properties'] = {}
  _map[_type]['properties']['last_update_date'] = {}
  _map[_type]['properties']['last_update_date']['type'] = 'date'
  _map[_type]['properties']['last_update_date']['format'] = 'dateOptionalTime'

  es = ElasticSearch(connection_string)
  try:
    es.create_index(process_index)
    logging.info('Creating index %s', process_index)
  except exceptions.IndexAlreadyExistsError as e:
    logging.info('%s already exists', process_index)

  try:
    es.put_mapping(process_index, doc_type=_type, mapping=_map)
    logging.info('Successfully created mapping')
  except:
    logging.fatal('Could not create the mapping')

  new_doc = {}
  new_doc['last_update_date'] = timestamp
  es.index(process_index,
           doc_type=_type,
           id=doc_id,
           doc=new_doc,
           overwrite_existing=True)
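For reference, a minimal call sketch; the index name and timestamp below are made-up values, assuming the surrounding pipeline imports this module:

update_process_datetime('drugevent', '2015-01-22T00:00:00')  # hypothetical index name and date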
Example #4
class ESDiffs(object):
    """Implementation of Elastic Search as diff backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    @staticmethod
    def to_id(label, old, new):
        return "%s/%s/%s" % (label, old, new)

    def put(self, label, old_version, new_version, diff):
        """Store a diff between two versions of a regulation node"""
        struct = {
            'label': label,
            'old_version': old_version,
            'new_version': new_version,
            'diff': diff
        }
        self.es.index(settings.ELASTIC_SEARCH_INDEX,
                      'diff',
                      struct,
                      id=self.to_id(label, old_version, new_version))

    def get(self, label, old_version, new_version):
        """Find the associated diff"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'diff',
                                 self.to_id(label, old_version, new_version))
            return result['_source']['diff']
        except ElasticHttpNotFoundError:
            return None
Example #5
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)

            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_part': part}}
        else:
            query = {'match_all': {}}
        query = {'fields': ['effective_on', 'fr_url', 'publication_date'],
                 'query': query}
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
Example #6
def update_process_datetime(doc_id, timestamp):
    ''' Updates the last_update_date for the document id passed into the function.
    The document id passed in will be the name of another index in the cluster.
    '''
    connection_string = 'http://localhost:9200'
    process_index = 'openfdametadata'
    _type = 'last_run'
    _map = {}
    _map[_type] = {}
    _map[_type]['properties'] = {}
    _map[_type]['properties']['last_update_date'] = {}
    _map[_type]['properties']['last_update_date']['type'] = 'date'
    _map[_type]['properties']['last_update_date'][
        'format'] = 'dateOptionalTime'

    es = ElasticSearch(connection_string)
    try:
        es.create_index(process_index)
        logging.info('Creating index %s', process_index)
    except exceptions.IndexAlreadyExistsError as e:
        logging.info('%s already exists', process_index)

    try:
        es.put_mapping(process_index, doc_type=_type, mapping=_map)
        logging.info('Successfully created mapping')
    except:
        logging.fatal('Could not create the mapping')

    new_doc = {}
    new_doc['last_update_date'] = timestamp
    es.index(process_index,
             doc_type=_type,
             id=doc_id,
             doc=new_doc,
             overwrite_existing=True)
Example #7
def main():
    # Train the Naive Bayes classifier
    f = open('./data_set/naivebayes_trained_model.pickle')
    NBClassifier = pickle.load(f)

    # ElasticSearch: call the es_indexer file to create the 'sentiment_analysis'
    # index and store the contents of the tweet file in that index
    es = ElasticSearch('http://localhost:9200/')
    es_indexer()

    ############ Indexing into Elasticsearch ############
    i = 0
    for each in tweet_data():
        i += 1
        testTweet = each
        processedTestTweet = process_tweet(testTweet)
        sentiment = NBClassifier.classify(
            extract_features(build_feature_vector(processedTestTweet)))

        es.index("sentiment_analysis", "document", {
            "text": testTweet,
            "sentiment": sentiment
        }, id=i)
    print "Indexing completed."

    es.refresh(index="sentiment_analysis")
    print "Index refreshed."

    f.close()
Example #8
class ESPipeline(object):
    def __init__(self, *args, **kwargs):
        self.client = ElasticSearch('http://localhost:9200/')

    def process_item(self, item, spider):
        self.client.index('wiki', 'page', dict(item))
        return item
Example #9
class ItvacaturesParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "it-vacatures"
        # bind elasticsearch to es
        self.es = ElasticSearch("http://localhost:9200/")

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        p = re.compile(r"<.*?>")
        werkgever = p.sub("", str(infoTwee))
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        locatieEen = infoTwee.find_next()
        p = re.compile(r"<.*?>")
        locatieTwee = p.sub("", str(locatieEen))
        p = re.compile(r"Locatie")
        locatie = p.sub("", str(locatieTwee))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "job-description"})
        p = re.compile(r"<.*?>")
        inhoud = p.sub("", str(body))
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse
        titel = self.parseTitel(soup)
        try:
            werkgever = self.parseWerkgever(soup)
        except:
            werkgever = "-"
        try:
            locatie = self.parseLocatie(soup)
        except:
            locatie = "-"
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r"(?s)/\*.*\*/", "", websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id (string)
        id = self.website + "-" + re.sub(r"\W+", "", titel)

        # make document to be sent to the elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website, datum, werkgever, locatie, "-", inhoud)

        # index (store) the vacancies in the ES database
        self.es.index("vacature-index", "vacature", document, id=document["id"])
        print "Es: " + titel
Example #10
class IitjobsParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "iitjobs"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        titel = titel.strip()
        return titel

    def parseWerkgever(self, soup):
        body = soup.find(
            "span",
            {"id": "ctl00_middleContent_idShowJobDetails_lblCompanyName"})
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(body))
        werkgever = werkgever.strip()
        return werkgever

    def parseLocatie(self, soup):
        body = soup.find(
            "span",
            {"id": "ctl00_middleContent_idShowJobDetails_lblCountryID"})
        p = re.compile(r'<.*?>')
        locatie = p.sub('', str(body))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "divJobDescrip"})
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        inhoud = inhoud.strip()
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse
        titel = self.parseTitel(soup)
        werkgever = self.parseWerkgever(soup)
        locatie = self.parseLocatie(soup)
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id for website (string)
        id = self.website + "-" + re.sub(r'\W+', '', titel)

        # make document to be sent to the elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)
        # index (store) the vacancies in the ES database
        self.es.index('vacature-index',
                      'vacature',
                      document,
                      id=document['id'])
        print('Es: ' + titel)
Example #11
class IctergezochtParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "ictergezocht"
        # bind elasticsearch to es
        self.es = ElasticSearch("http://localhost:9200/")

    def parseWerkgever(self, soup):
        info = soup.find(class_="highlight")
        p = re.compile(r"<.*?>")
        werkgever = p.sub("", str(info))
        return werkgever

    def parseLocatie(self, soup):
        infoTwee = soup.find(class_="bf")
        locatieEen = infoTwee.find_next()
        locatieTwee = locatieEen.find_next()
        locatieDrie = locatieTwee.find_next()
        locatieVier = locatieDrie.find_next()
        p = re.compile(r"<.*?>")
        locatieVijf = p.sub("", str(locatieVier))
        p = re.compile(r"Locatie")
        locatie = p.sub("", str(locatieVijf))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find(class_="vacancybody")
        p = re.compile(r"<.*?>")
        inhoud = p.sub("", str(body))
        return inhoud

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        titel = self.parseTitel(soup)
        if titel.startswith("Vacature"):
            # parse
            werkgever = self.parseWerkgever(soup)
            locatie = self.parseLocatie(soup)
            inhoud = self.parseInhoud(soup)
            websiteUrl = re.sub(r"(?s)/\*.*\*/", "", websiteUrl)
            datum = time.strftime("%d-%m-%Y")
            # generate id website (string)
            id = self.website + "-" + re.sub(r"\W+", "", titel)

            # make document
            document = self.makeDocument(id, titel, websiteUrl, self.website, datum, werkgever, locatie, "-", inhoud)
            # index (store) the vacancies in the ES database
            self.es.index("vacature-index", "vacature", document, id=document["id"])
            print "Es: " + titel
Example #12
class IctergezochtParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "ictergezocht"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseWerkgever(self, soup):
        info = soup.find(class_="highlight")
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(info))
        return werkgever

    def parseLocatie(self, soup):
        infoTwee = soup.find(class_="bf")
        locatieEen = infoTwee.find_next()
        locatieTwee = locatieEen.find_next()
        locatieDrie = locatieTwee.find_next()
        locatieVier = locatieDrie.find_next()
        p = re.compile(r'<.*?>')
        locatieVijf = p.sub('', str(locatieVier))
        p = re.compile(r'Locatie')
        locatie = p.sub('', str(locatieVijf))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find(class_="vacancybody")
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        return inhoud

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        titel = self.parseTitel(soup)
        if titel.startswith("Vacature"):
            # parse
            werkgever = self.parseWerkgever(soup)
            locatie = self.parseLocatie(soup)
            inhoud = self.parseInhoud(soup)
            websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
            datum = time.strftime("%d-%m-%Y")
            # generate id website (string)
            id = self.website + "-" + re.sub(r'\W+', '', titel)

            #make document
            document = self.makeDocument(id, titel, websiteUrl, self.website, datum, werkgever, locatie, "-", inhoud)
            # index (store) the vacancies in the ES database
            self.es.index('vacature-index', 'vacature', document, id=document['id'])
            print "Es: " + titel
Example #13
File: tools.py Project: frecar/postguider
def dump_one_and_one_post_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')

    relevant_posts = []

    for element in Newsfeed.newsfeed(token, [], 0, None, 1000):
        if 'from' in element and 'category' in element['from']:
            continue

        post = Post(element, token)

        es.index(token.lower(), "post", post.serialize())
Example #14
class IitjobsParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "iitjobs"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        titel = titel.strip()
        return titel

    def parseWerkgever(self, soup):
        body = soup.find("span", {"id": "ctl00_middleContent_idShowJobDetails_lblCompanyName"})
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(body))
        werkgever = werkgever.strip()
        return werkgever

    def parseLocatie(self, soup):
        body = soup.find("span", {"id": "ctl00_middleContent_idShowJobDetails_lblCountryID"})
        p = re.compile(r'<.*?>')
        locatie = p.sub('', str(body))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "divJobDescrip"})
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        inhoud = inhoud.strip()
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse
        titel = self.parseTitel(soup)
        werkgever = self.parseWerkgever(soup)
        locatie = self.parseLocatie(soup)
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id for website (string)
        id = self.website + "-" + re.sub(r'\W+', '', titel)

        # make document to be sent to the elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website, datum, werkgever, locatie, "-", inhoud)
        # index (store) the vacancies in the ES database
        self.es.index('vacature-index', 'vacature', document, id=document['id'])
        print('Es: ' + titel)
Example #15
class ElasticPush(Handler):
    """Posts events to ES."""

    def __init__(self, host='localhost', dest=[]):
        Handler.__init__(self, dest=dest)
        self.es = ElasticSearch('http://%s:9200/' % (host))
        self.source_host = socket.gethostname()

    def push(self, data):
        self.debug("Pushing data %s to elastic search" % (data))
        event = ElasticEvent(data)
        self.es.index("carmanor", "line", event.dict())

        Handler.push(self, data)
Example #16
File: tools.py Project: frecar/postguider
def dump_relevant_newsfeed_to_elasticsearch(token):
    es = ElasticSearch('http://localhost:9200/')

    relevant_posts = []

    for element in Newsfeed.newsfeed(token, [], 0, None, 1000):
        if 'from' in element and 'category' in element['from']:
            continue

        post = Post(element, token)
        relevant_posts.append(post.serialize())

    data = {'posts': relevant_posts}
    es.index(token.lower(), "post", data, id=1)
Example #17
File: views.py Project: Taraka16/neonion-1
def resource_create(request, index):
    data = json.loads(request.POST['data'])
    data['new'] = True
    # random identifier
    data['uri'] = ''.join(random.choice('0123456789ABCDEF') for i in range(32))

    # store data in elasticsearch
    es = ElasticSearch(settings.ELASTICSEARCH_URL)
    if index == 'persons':
        es.index(index, "person", data)
    elif index == 'institutes':
        es.index(index, "institute", data)
    es.refresh(index)

    return JsonResponse(data)
Example #18
File: views.py Project: Taraka16/neonion-1
def resource_create(request, index):
    data = json.loads(request.POST['data'])
    data['new'] = True
    # random identifier
    data['uri'] = ''.join(random.choice('0123456789ABCDEF') for i in range(32))

    # store data in elasticsearch
    es = ElasticSearch(settings.ELASTICSEARCH_URL)
    if index == 'persons':
        es.index(index, "person", data)
    elif index == 'institutes':
        es.index(index, "institute", data)
    es.refresh(index)

    return JsonResponse(data)
Example #19
File: scrape.py Project: yychen/estest
def main():
    # url = u'https://blog.gslin.org/archives/2015/01/22/5548/backblaze-%E5%85%AC%E4%BD%88%E7%A1%AC%E7%A2%9F%E6%95%85%E9%9A%9C%E7%8E%87/'
    url = u'http://yychen.joba.cc/dev/archives/164'
    es = ElasticSearch(HOST)
    for i in range(20):
        item, url = get_page(url)

        if not url:
            print '\033[1;33mWe\'ve reached the end, breaking...\033[m'
            break

        # put it into es
        print 'Indexing \033[1;37m%s\033[m (%s)...' % (item['title'],
                                                       item['url'])
        es.index(INDEX, DOCTYPE, doc=item, id=item['url'])
Example #20
class WeatherDatabase(object):
    def __init__(self, server='http://0.0.0.0:9901'):
        self.server = server
        self.es = ElasticSearch(server)

    def index(self, data):
        return self.es.index('weather', 'sensor', data)
Example #21
class WeatherDatabase(object):

    def __init__(self, server='http://0.0.0.0:9901'):
        self.server = server
        self.es = ElasticSearch(server)

    def index(self, data):
        return self.es.index('weather', 'sensor', data)
Example #22
class SensorDatabase(object):

    def __init__(self, server='http://0.0.0.0:9901'):
        self.server = server
        self.es = ElasticSearch(server)

    def index(self, sensor_id, data):
        return self.es.index('domotic', 'sensor_values', data)
Example #23
    def test_cluster_size_3(self):
        cluster = self._make_one(size=3)
        cluster.start()
        self.assertEqual(len(cluster), 3)
        self.assertEqual(len(cluster.hosts), 3)
        self.assertEqual(len(os.listdir(cluster.working_path)), 3)
        self.assertEqual(len(cluster.urls), 3)
        client = ElasticSearch(cluster.urls, max_retries=2)
        self.assertEqual(client.health()['number_of_nodes'], 3)
        # test if routing works and data is actually distributed across nodes
        client.create_index('test_shards', settings={
            'number_of_shards': 1,
            'number_of_replicas': 2,
        })
        client.index('test_shards', 'spam', {'eggs': 'bacon'})
        client.refresh('test_shards')
        shard_info = client.status()['indices']['test_shards']['shards']['0']
        nodes = set([s['routing']['node'] for s in shard_info])
        self.assertTrue(len(nodes) > 1)
Example #24
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""
    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX,
                      'notice',
                      notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)

            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_parts': part}}
        else:
            query = {'match_all': {}}
        query = {
            'fields': ['effective_on', 'fr_url', 'publication_date'],
            'query': query
        }
        notices = []
        results = self.es.search(query,
                                 doc_type='notice',
                                 size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices
Example #25
class ElasticSearchPublisher(Publisher):
    '''
    Publishes to an ElasticSearch Index
    '''
    def __init__(self, elasticsearch_url, index_name, **kwargs):
        self.elasticsearch_url = elasticsearch_url
        self.index_name = index_name
        self.connection = ElasticSearch(self.elasticsearch_url)
    
    def publish(self, event, message, event_id):
        response = self.connection.index(self.index_name, event, message, id=event_id)
        return response
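A rough usage sketch with made-up values; per the publish() signature above, event becomes the Elasticsearch document type and message the document body:

publisher = ElasticSearchPublisher('http://localhost:9200/', 'events')  # hypothetical URL and index name
response = publisher.publish('deploy', {'status': 'ok', 'host': 'web-1'}, 'deploy-42')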
Example #26
File: import.py Project: jucabot/palmyr
def load_wordcloud(
    filename,
    index_name,
    type_name,
    category,
    name,
    zone="France",
    sep=";",
    display="wordcloud",
    source="",
    description="",
):

    f = open(filename, mode="r")
    es = ElasticSearch(CONTEXT["datahub-store"])

    categories = []
    for line in f:
        # split only on the first separator so values containing the separator stay intact
        key, string_value = line.split(sep, 1)
        value = cjson.decode(string_value)

        categories.append((value["label"], value["norm_count"] / 100.0))

    serie = {
        "name": name,
        "owner": "public",
        "display": display,
        "zone": zone,
        "category": category,
        "source": source,
        "description": unicode(description) % (name),
        "data": {
            "categories": map(lambda item: item[0], categories),
            "series": [{"data": map(lambda item: item[1], categories)}],
        },
    }
    es.index(index_name, display, serie)

    es.refresh(index_name)
    f.close()
Example #27
	def indexLookup(self):

		es = ElasticSearch('http://104.236.54.204:9200')

		with open("author_lookup.json", "r") as f:

			lookups = json.loads(f.read())
			for lookup in lookups:

				doc = {}

				doc['id'] = lookup
					
				titles = []

				for x in lookups[lookup]:

					print (x)
					t = str(x[0]) + "|" + x[1]
					titles.append(t)

				doc['titles'] = titles





				try:
					es.index(index="titles", doc_type="title", id=lookup, doc=doc)

				except exceptions.ElasticHttpError as e:

					print("Error on this one")
					print(doc["id"])
					print(str(e))


				print (lookup)
Example #28
	def indexAuthors(self):

		es = ElasticSearch('http://localhost:9200')

		with open("allData.json", "r") as f:

			authors = json.loads(f.read())

			for author in authors:

				#i don't want to add the about as a topic right now
				authors[author]['groups']['about'] = []

				print ("Doing", author)

				try:
					es.index(index="gutenberg", doc_type="author", id=authors[author]['id'], doc=authors[author])
				except exceptions.ElasticHttpError as e:


					print ("-----------------")
					print ("Error indexing this author:",author)
					print (e)
					print ("-----------------")
Example #29
class ElasticConnector(Connector):
    """
    Class for connectors that are operate with elasticsearch database
  """
    MAX_SIZE = 1000

    def __init__(self, database, host='http://localhost:9200/'):
        self.client = ElasticSearch(host)
        self.index = database
        self.create_index()

    def query_to_id(self, query):
        """
      Returns id representation of a specified query
      This is a temporary method as a replacement of elasticsearch query search
    """
        return "_".join(str(k) + "_" + str(v)
                        for k, v in query.items()).replace("/", "_")

    def create_index(self):
        """
      Creates specified index or catches an exception if it has already been created
    """
        try:
            self.client.create_index(self.index)
        except Exception as e:
            pass

    def set_dynamic_mapping(self, collection):
        """
      Sets dynamic mapping for a specified document type
    """
        self.client.put_mapping(self.index, collection, {'dynamic': True})

    def save_block(self, block):
        """
      Saves operation info in a database
    """
        super().save_block(block)
        collection = block.get_collection()
        dictionary = block.to_dict()
        query = block.get_query()
        self.update_by_query(collection, query, block)

    def update_by_query(self, collection, query, document):
        """
      Sets dynamic mapping for a specified collection,
      then creates a new id for a document depending on query for it.
      Saves a new object in a database as a new one
    """
        try:
            self.set_dynamic_mapping(collection)
            document_id = document.get_id()
            document_body = document.to_dict()
            if "_id" in document_body.keys():
                del document_body['_id']
            self.client.index(self.index,
                              collection,
                              document_body,
                              id=self.query_to_id(query))
        except Exception as e:
            print(e)
            pass

    def find_last_block(self):
        """
      Finds last block index as a value field of a document 
      in a status collection with specified id
    """
        try:
            document = self.client.get(self.index, 'status',
                                       'height_all_tsx')['_source']
            return document['value']
        except ElasticHttpNotFoundError as e:
            return 0

    def update_last_block(self, last_block):
        """
      Updates last block index as a value field of a document 
      in a status collection with specified id
    """
        self.client.index(self.index,
                          'status', {'value': last_block},
                          id='height_all_tsx')

    def save_instance(self, instance):
        """
      Saves account or comment object
    """
        self.update_by_query(instance.get_collection(), instance.get_query(),
                             instance)

    def get_instances_to_update(self, collection):
        """
      Finds and returns all dictionaries with objects that should be updated
    """
        hits = self.client.search("need_update:true",
                                  index=self.index,
                                  doc_type=collection,
                                  size=self.MAX_SIZE)['hits']['hits']
        return [{**hit['_source'], **{"_id": hit["_id"]}} for hit in hits]

    def update_instances(self, collection, instances):
        """
      Resets need_update flag for all instances in a list by their ids in _id field
    """
        for instance in instances:
            self.client.update(self.index,
                               collection,
                               instance["_id"],
                               doc={'need_update': False})
Example #30
def isint(string):
    try:
        int(string)
        return True
    except ValueError:
        return False


def isflt(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


for item in root.findall('artikel'):
    list.append(articles)
    articles = {}
    for subitem in item:
        if subitem.text is not None:
            if isint(subitem.text):
                articles[subitem.tag] = int(subitem.text)
            elif isflt(subitem.text):
                articles[subitem.tag] = float(subitem.text)
            else:
                articles[subitem.tag] = subitem.text


for i, article in enumerate(list):
    if len(article) == 0:
        print "Empty value found"
    else:
        print es.index('articles3', 'article', article, id=i)
Example #31
try:
    es.create_index('recast')
except IndexAlreadyExistsError:
    pass


r = requests.get(ELASTIC_SEARCH_URL)
i=1
while r.status_code == 200:
    url = 'http://recast-rest-api.herokuapp.com/analysis/{}'.format(i)
    r = requests.get(url)
    if not r.status_code == 200:
        break

    data = cleanJson(r.content)
    es.index('recast', 'analysis', json.dumps(data))
    i = i+1


r = requests.get(ELASTIC_SEARCH_URL)
i=1
while r.status_code == 200:
    url = 'http://recast-rest-api.herokuapp.com/requests/{}'.format(i)
    r = requests.get(url)
    if not r.status_code == 200:
        break
    
    data = cleanJson(r.content)
    es.index('recast', 'requests', json.dumps(data))
    i = i+1
Example #32
class Elastic(DataLayer):
    """ElasticSearch data layer."""

    serializers = {
        'integer': int,
        'datetime': parse_date
    }

    def init_app(self, app):
        app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/')
        app.config.setdefault('ELASTICSEARCH_INDEX', 'eve')
        self.es = ElasticSearch(app.config['ELASTICSEARCH_URL'])
        self.index = app.config['ELASTICSEARCH_INDEX']

    def _get_field_mapping(self, schema):
        """Get mapping for given field schema."""
        if schema['type'] == 'datetime':
            return {'type': 'date'}
        elif schema['type'] == 'string' and schema.get('unique'):
            return {'type': 'string', 'index': 'not_analyzed'}
        elif schema['type'] == 'string':
            return {'type': 'string'}

    def put_mapping(self, app):
        """Put mapping for elasticsearch for current schema.

        It's not called automatically now, but rather left for user to call it whenever it makes sense.
        """
        for resource, resource_config in app.config['DOMAIN'].items():
            properties = {}
            properties[config.DATE_CREATED] = self._get_field_mapping({'type': 'datetime'})
            properties[config.LAST_UPDATED] = self._get_field_mapping({'type': 'datetime'})

            for field, schema in resource_config['schema'].items():
                field_mapping = self._get_field_mapping(schema)
                if field_mapping:
                    properties[field] = field_mapping

            datasource = (resource, )  # TODO: config.SOURCES not available yet (self._datasource_ex(resource))
            mapping = {}
            mapping[datasource[0]] = {'properties': properties}
            self.es.put_mapping(self.index, datasource[0], mapping)

    def find(self, resource, req, sub_resource_lookup):
        """
        TODO: implement sub_resource_lookup
        """
        query = {
            'query': {
                'query_string': {
                    'query': request.args.get('q', '*'),
                    'default_field': request.args.get('df', '_all'),
                    'default_operator': 'AND'
                }
            }
        }

        if not req.sort and self._default_sort(resource):
            req.sort = self._default_sort(resource)

        # skip sorting when there is a query to use score
        if req.sort and 'q' not in request.args:
            query['sort'] = []
            sort = ast.literal_eval(req.sort)
            for (key, sortdir) in sort:
                sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')])
                query['sort'].append(sort_dict)

        if req.where:
            where = json.loads(req.where)
            if where:
                query['filter'] = {
                    'term': where
                }

        if req.max_results:
            query['size'] = req.max_results

        if req.page > 1:
            query['from'] = (req.page - 1) * req.max_results

        source_config = config.SOURCES[resource]
        if 'facets' in source_config:
            query['facets'] = source_config['facets']

        try:
            args = self._es_args(resource)
            args['es_fields'] = self._fields(resource)
            return self._parse_hits(self.es.search(query, **args), resource)
        except es_exceptions.ElasticHttpError:
            return ElasticCursor()

    def find_one(self, resource, **lookup):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)

        if config.ID_FIELD in lookup:
            try:
                hit = self.es.get(id=lookup[config.ID_FIELD], **args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

            if not hit['exists']:
                return

            doc = hit.get('fields', hit.get('_source', {}))
            doc['_id'] = hit.get('_id')
            convert_dates(doc, self._dates(resource))
            return doc
        else:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {
                            'term': lookup
                        }
                    }
                }
            }

            try:
                args['size'] = 1
                docs = self._parse_hits(self.es.search(query, **args), resource)
                return docs.first()
            except es_exceptions.ElasticHttpNotFoundError:
                return None

    def find_list_of_ids(self, resource, ids, client_projection=None):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        return self._parse_hits(self.es.multi_get(ids, **args), resource)

    def insert(self, resource, doc_or_docs, **kwargs):
        ids = []
        kwargs.update(self._es_args(resource))
        for doc in doc_or_docs:
            doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs))
            ids.append(doc['_id'])
        self.es.refresh(self.index)
        return ids

    def update(self, resource, id_, updates):
        args = self._es_args(resource, refresh=True)
        return self.es.update(id=id_, doc=updates, **args)

    def replace(self, resource, id_, document):
        args = self._es_args(resource, refresh=True)
        args['overwrite_existing'] = True
        return self.es.index(doc=document, id=id_, **args)

    def remove(self, resource, id_=None):
        args = self._es_args(resource, refresh=True)
        if id_:
            return self.es.delete(id=id_, **args)
        else:
            try:
                return self.es.delete_all(**args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

    def _parse_hits(self, hits, resource):
        """Parse hits response into documents."""
        return ElasticCursor(hits, self._dates(resource))

    def _es_args(self, resource, refresh=None):
        """Get index and doctype args."""
        datasource = self._datasource(resource)
        args = {
            'index': self.index,
            'doc_type': datasource[0],
            }
        if refresh:
            args['refresh'] = refresh
        return args

    def _fields(self, resource):
        """Get projection fields for given resource."""
        datasource = self._datasource(resource)
        keys = datasource[2].keys()
        return ','.join(keys)

    def _default_sort(self, resource):
        datasource = self._datasource(resource)
        return datasource[3]

    def _dates(self, resource):
        dates = [config.LAST_UPDATED, config.DATE_CREATED]
        datasource = self._datasource(resource)
        schema = config.DOMAIN[datasource[0]]['schema']
        for field, field_schema in schema.items():
            if field_schema['type'] == 'datetime':
                dates.append(field)
        return dates
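The put_mapping docstring above notes that mappings are not created automatically; here is a minimal, untested sketch of calling it once at startup, assuming an Eve application is wired to this data layer (the names below are illustrative):

from eve import Eve

app = Eve(data=Elastic)        # assumes Eve is configured with this Elastic data layer
with app.app_context():
    app.data.put_mapping(app)  # pushes one mapping per resource in app.config['DOMAIN']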
Example #33
# -*- coding: utf-8 -*-

import codecs
import json
import re
from pyelasticsearch import ElasticSearch

es = ElasticSearch('http://localhost:9200/')

file = codecs.open('shop_data.json', mode='r', encoding='utf-8')

index = 0

for line in file.readlines():
    data = json.loads(line)
    data.pop("id")
    data["shop_tel"] = re.sub(" +", ",", data["shop_tel"])
    data["shop_tel"] = data["shop_tel"].encode("utf8").replace("电话:", "").split(",")[1:]

    data["location"] = re.sub(" +", ",", data["location"]).split(",")
    data["location"] = data["location"][1] + "," + data["location"][0]

    data["shop_tags"] = re.sub("\(\d+\)", "", data["shop_tags"])
    data["shop_tags"] = re.sub(" +", ",", data["shop_tags"])
    data["shop_tags"] = data["shop_tags"].encode("utf8").replace("分类标签:,", "").split(",")[:-1]

    data["open_time"] = re.sub(" +", "", data["open_time"])
    data["open_time"] = data["open_time"].encode("utf8").replace("营业时间:", "").replace("添加", "").replace("修改", "").replace(":",":")
    index += 1
    es.index('dianping', 'food', data, id=index)
Example #34
        "logo": crawled_data.get("logo"),
        "twitter": crawled_data.get("twitter"),
        "station_site": crawled_data.get("station_site"),
        "primary_genre": crawled_data.get("primary_genre"),
        "frequency": crawled_data.get("frequency"),
        "shoutcast_url": crawled_data.get("shoutcast_url"),
    }

    # TODO: get lat, lon
    if hasattr(settings, 'GEONAMES_USER') and settings.GEONAMES_USER != "demo":
        params = {
            "name_equals": index_data["city"],
            "country": index_data["country"],
            "adminCode1": index_data["state"],
            "maxRows": 10,
            "lang": "en",
            "username": settings.GEONAMES_USER,
            "style": "medium"
        }
        geo_request = requests.get("http://api.geonames.org/searchJSON",
                                   params=params)
        geonames = geo_request.json().get("geonames", [])
        if geonames:
            index_data["location"] = {
                "lat": float(geonames[0]["lat"]),
                "lon": float(geonames[0]["lng"])
            }

    es.index(INDEX_NAME, 'station', index_data, id=crawled_data.get('id'))

print("Bailed after %d failures (pk %d)" % (failures, pk))
Example #35
File: models.py Project: SpeeDly/partytask
    def save(self, force_insert=False, force_update=False, **kwargs):
        es = ElasticSearch(ELASTIC_SEARCH_URL)
        if self.id:
            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.update('glamazer', 'modelresult', 'listings.listing.{0}'.format(self.id),
                script="ctx._source.listing_id = listing;" +
                "ctx._source.artist_id = artist;" +
                "ctx._source.artist_avatar = artist_avatar;" +
                "ctx._source.title = title;" +
                "ctx._source.location = location;" +
                "ctx._source.description = description;" +
                "ctx._source.get_picture = get_picture;" +
                "ctx._source.metadata = metadata;" +
                "ctx._source.price = price;" +
                "ctx._source.likes = likes;" +
                "ctx._source.comments = comments;" +
                "ctx._source.tags = tags;" +
                "ctx._source.status = status;" +
                "ctx._source.style = style;" +
                "ctx._source.rating = rating",
                params={
                    'listing':self.id, 
                    'artist':self.get_artist_id(),
                    'artist_avatar':self.get_artist_avatar(),
                    'title':self.title,
                    'location':location_es,
                    'description':self.description, 
                    'get_picture':self.get_picture(),
                    'metadata':self.metadata,
                    'price':self.price,
                    'likes':self.likes,
                    'comments':self.comments,
                    'tags':self.get_tags(),
                    'status':self.status,
                    'style':self.get_style(),
                    'rating':self.get_rating()
                    })
            super(Listing, self).save(force_insert, force_update)
        else:
            super(Listing, self).save(force_insert, force_update)

            artist_user = self.artist.user
            artist_name = artist_user.first_name
            followers = Followers.objects.select_related().filter(artist=self.artist)
            for follower in followers:
                Notification.objects.create(
                    sender = artist_user,
                    receiver = follower.user,
                    time = current_time(),
                    short_text = NOTIFICATIONS_SHORT[10].format(artist=artist_name),
                    long_text = NOTIFICATIONS_LONG[10].format(artist=artist_name, listing=self.title, user_id=self.artist_id, metadata=self.id),
                )

            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.index('glamazer', 'modelresult', 
                {
                    'listing_id': self.id,
                    'artist_id': self.artist_id,
                    'artist_avatar':self.get_artist_avatar(),
                    'title': self.title,
                    'location': location_es,
                    'description': self.description,
                    'get_picture': self.get_picture(),
                    'metadata': self.metadata,
                    'price': self.price,
                    'likes': self.likes,
                    'comments':self.comments,
                    'tags': self.get_tags(),
                    'status':self.status,
                    'style':self.get_style(),
                    'rating':self.get_rating()
                }, id='listings.listing.{0}'.format(self.id))
            es.refresh('glamazer')
Example #36
File: models.py Project: kahihia/glamfame
    def save(self, force_insert=False, force_update=False, **kwargs):
        es = ElasticSearch(ELASTIC_SEARCH_URL)
        if self.id:
            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.update(
                'glamazer',
                'modelresult',
                'listings.listing.{0}'.format(self.id),
                script="ctx._source.listing_id = listing;" +
                "ctx._source.artist_id = artist;" +
                "ctx._source.artist_avatar = artist_avatar;" +
                "ctx._source.artist_name = artist_name;" +
                "ctx._source.salon_id = salon;" +
                "ctx._source.salon_avatar = salon_avatar;" +
                "ctx._source.salon_name = salon_name;" +
                "ctx._source.title = title;" +
                "ctx._source.location = location;" +
                "ctx._source.description = description;" +
                "ctx._source.get_picture = get_picture;" +
                "ctx._source.metadata = metadata;" +
                "ctx._source.gender = gender;" + "ctx._source.price = price;" +
                "ctx._source.currency = currency;" +
                "ctx._source.likes = likes;" +
                "ctx._source.comments = comments;" +
                "ctx._source.tags = tags;" + "ctx._source.status = status;" +
                "ctx._source.style = style;" + "ctx._source.rating = rating",
                params={
                    'listing': self.id,
                    'artist': self.get_artist_id(),
                    'artist_avatar': self.get_artist_avatar(),
                    'artist_name': self.get_artist_name(),
                    'salon': self.get_salon_id(),
                    'salon_avatar': self.get_salon_avatar(),
                    'salon_name': self.get_salon_name(),
                    'title': self.title,
                    'location': location_es,
                    'description': self.description,
                    'get_picture': self.get_picture(),
                    'metadata': self.metadata,
                    'gender': self.gender,
                    'price': self.price,
                    'currency': self.currency,
                    'likes': self.likes,
                    'comments': self.comments,
                    'tags': self.get_tags(),
                    'status': self.status,
                    'style': self.get_style(),
                    'rating': self.get_rating()
                })
            super(Listing, self).save(force_insert, force_update)
        else:
            super(Listing, self).save(force_insert, force_update)

            location = self.get_location()
            location_es = "{0},{1}".format(location.y, location.x)
            es.index('glamazer',
                     'modelresult', {
                         'listing_id': self.id,
                         'artist_id': self.artist_id,
                         'artist_avatar': self.get_artist_avatar(),
                         'artist_name': self.get_artist_name(),
                         'salon_id': self.get_salon_id(),
                         'salon_avatar': self.get_salon_avatar(),
                         'salon_name': self.get_salon_name(),
                         'title': self.title,
                         'location': location_es,
                         'description': self.description,
                         'get_picture': self.get_picture(),
                         'metadata': self.metadata,
                         'gender': self.gender,
                         'price': self.price,
                         'currency': self.currency,
                         'likes': self.likes,
                         'comments': self.comments,
                         'tags': self.get_tags(),
                         'status': self.status,
                         'style': self.get_style(),
                         'rating': self.get_rating()
                     },
                     id='listings.listing.{0}'.format(self.id))
            es.refresh('glamazer')
Example #37
import json
from bson import json_util
from pyelasticsearch import ElasticSearch,bulk_chunks
from pymongo import MongoClient

conn = MongoClient() # defaults to localhost
db = conn.sci
tweetsdb = db['focal']

elastic = ElasticSearch('http://localhost:9200')
elastic.delete_all_indexes()

# This would be all around better and faster with the bulk API, 
# but for the life of me I can't make bulk take the 
# json output.
i = 0
for tweet in tweetsdb.find():
	if i % 1000 == 0:
		print i
	elastic.index('db','tweets',json.dumps(tweet,default=json_util.default))
	i += 1

print 'Records written successfully!'
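The comment above mentions that the bulk API would be faster; here is a rough, untested sketch of what that might look like with pyelasticsearch's bulk helpers, reusing the tweetsdb and elastic objects from this snippet and round-tripping each BSON document through json so index_op() receives a plain dict:

def tweet_docs():
    for tweet in tweetsdb.find():
        # index_op() expects a dict, not a pre-serialized JSON string
        doc = json.loads(json.dumps(tweet, default=json_util.default))
        yield elastic.index_op(doc)

for chunk in bulk_chunks(tweet_docs(), docs_per_chunk=500):
    elastic.bulk(chunk, index='db', doc_type='tweets')
elastic.refresh('db')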
Example #38
class BonqueParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "bonque"
        # bind elasticsearch to es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find(class_="info")
        p = re.compile(r'<.*?>')
        infoText = p.sub('', str(info))
        p2 = re.compile(r'Werkgever ')
        werkgeverText = p2.sub('', infoText)
        p3 = re.compile(r'Locatie.*')
        werkgever = p3.sub('', werkgeverText)
        werkgever = werkgever.strip()
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find(class_="info")
        p = re.compile(r'<.*?>')
        infoText = p.sub('', str(info))
        p2 = re.compile(r'Werkgever ')
        werkgeverText = p2.sub('', infoText)
        p4 = re.compile(r'(?s).*?Locatie ')
        locatie = p4.sub('', werkgeverText)
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.findAll('p')
        inhoud = ""
        for i in body:
            text = i.text
            text = re.sub('\'', '', text)
            text = text.strip()
            inhoud = inhoud + text.encode('utf8')
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse
        titel = self.parseTitel(soup)
        werkgever = self.parseWerkgever(soup)
        locatie = self.parseLocatie(soup)
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        #make id (str)
        id = self.website + "-" + re.sub(r'\W+', '', titel)

        # make document for elasticsearch db
        document = self.makeDocument(id, titel, websiteUrl, self.website, datum, werkgever, locatie, "-", inhoud)

        # index (store) the vacancies in the ES database
        self.es.index('vacature-index', 'vacature', document, id=document['id'])
        print "Es: " + titel
Example #39
            os.chmod(newfile, 0644)
            filestring = 'Attachment: <a href="%sattachments/%s">%s</a><br>\n' % (baseurl, newshort, newshort)
            load += filestring
else:
    load = msg.get_payload()

payload = load.rstrip("\n")
# attachment strings aren't sanitized, so we're doing this to keep them from breaking anything
payload = payload.replace("'", "&#039;")

# write the message to elasticsearch
if not debugmode:
    x = {'submitted': subdate, 'sent': msgdate, 'from': msgfrom, 'subject': msgsubject, 'body': payload}
    es = ElasticSearch(elasticsearch)
    print json.dumps(x)
    es.index("oplog", queue, x)

if txtlog == 'T':
    # write the message to the text log
    # we're reconstructing variables here because the formatting is different
    logsubject = msg['Subject']
    logfrom = msg['From'].replace("'", "")
    logpayload = load.rstrip("\n")
    logrecord = """System  date: %s\n
Message date: %s\n
Message from: %s\n
Message subject: %s\n
Message body:\n
%s\n\n
<----------------END MESSAGE ------------------------>\n\n""" % (recdate, msgdate, logfrom, logsubject, logpayload)
    degbu(logrecord)
Example #40
class Worker(Process):
    def __init__(self, queue, number=-1):
        self.__queue = queue
        self.number = number
        self.name = "index_plays worker #%d" % number
        self.es = ElasticSearch(settings.ES_URL)
        Process.__init__(self)

    def parse_metadata(self, metadata):
        stream_info = metadata.split(";")
        for info in metadata.split(";"):
            key = info[:info.index('=')]
            value = info[info.index('=') + 1:]

            if key == 'StreamTitle':
                info = value[1:-1]

    def run(self):
        while 1:
            item = self.__queue.get()
            if item is None:
                break

            station_id, shoutcast_url, last_playing, last_playing_time = item
            try:
                r = requests.get(shoutcast_url,
                                 headers={'Icy-Metadata': '1'},
                                 stream=True)
            except requests.exceptions.ConnectionError:
                continue

            # Parse the headers
            headers = {}
            line = ""
            for content in r.iter_content():
                line += content
                if line[-2:] == '\r\n':
                    # Line ended
                    if ":" in line:
                        key = line[:line.index(":")]
                        value = line[line.index(":") + 1:-2]
                        headers[key] = value

                    if len(line) == 2:
                        break
                    line = ""

            # We really need the metaint, so that we know where to look for metadata
            if 'icy-metaint' not in headers:
                print("No icy-metaint!")
                continue

            metaint = int(headers.get('icy-metaint'))

            data = r.raw.read(metaint)
            length = r.raw.read(1)
            if len(length) != 1:
                continue  # It seems like sometimes it's getting stuck here.

            length = struct.unpack('B', length)[0]
            metadata = r.raw.read(length * 16)
            r.close()
            # Now we've got the metadata string!
            if metadata != last_playing:
                stream_info = metadata.split(";")
                for info in metadata.split(";"):
                    try:
                        split_index = info.index('=')
                    except ValueError:
                        continue

                    key = info[:split_index]
                    value = info[split_index + 1:]

                    if key == 'StreamTitle':
                        s = StreamTitle(value)
                        if s.is_song():
                            print("[worker %s] %s : %s" %
                                  (self.number, s.description,
                                   s.data.get('text')))
                            doc = s.data
                            doc['description'] = s.description
                            self.es.index(settings.ES_INDEX,
                                          'play',
                                          doc,
                                          parent=station_id)
                            last_playing_time = datetime.datetime.now()

            if last_playing_time is None:
                last_playing_time = datetime.datetime.now()

            # A valid station should be playing a song at least once every 30 minutes.
            if last_playing_time > (datetime.datetime.now() -
                                    datetime.timedelta(minutes=30)):
                # Send it around again....
                self.__queue.put(
                    (station_id, shoutcast_url, metadata, last_playing_time))
Example #41
File: lbrest.py Project: lightbase/LBIndex
class LBRest():

    def __init__(self, base=None, idx_exp_url=None, 
                 txt_mapping=None, cfg_idx=None):
        """Serve para cosumir o LBG e o ES."""

        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            self.idx_exp_host = idx_exp_url.split('/')[2]
            self.idx_exp_index = idx_exp_url.split('/')[3]
            self.idx_exp_type = idx_exp_url.split('/')[4]
            self.es = ElasticSearch("http://" + self.idx_exp_host)
        self.txt_mapping = txt_mapping
        self.cfg_idx = cfg_idx
        self.con_refsd = False

    def get_index(self, bases_list):
        """Obter a a configuração de indexação p/ as bases."""

        bases_indexes = []
        for base in bases_list:
            idx_exp_url = base['metadata']['idx_exp_url']
            nm_idx = idx_exp_url.split('/')[3]
            url_txt_idx = config.REST_URL + "/_txt_idx/" + nm_idx
            req = None
            try:
                req = requests.get(url_txt_idx)
                req.raise_for_status()
                idx_resp = req.json()
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:

                    # NOTE: Para os casos onde não há configuração de 
                    # indexação setada na rota "_txt_idx"! By Questor
                    idx_resp = None
                else:
                    fail_content = None
                    if req is not None:
                        fail_content = req._content
                    else:
                        fail_content = str(e)
                    logger.error("Falha HTTP ao tentar obter configuração de "\
                    "índice textual! URL: %s. FALHA: %s" % 
                    (config.REST_URL, fail_content))
                    return []
            except Exception as e:
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error("Erro ao tentar obter a configuração de índice "\
                "textual! URL: %s. FALHA: %s" % 
                (config.REST_URL, fail_content))
                return []
            bases_indexes.append({"base": base, "index": idx_resp})
        return bases_indexes

    def get_bases(self):
        """Get all bases which has to index registries."""

        # NOTE: A construção logo abaixo tá meio tosca. O objetivo é
        # checar se na estrutura de dados da table "lb_base" já está 
        # o atributo (campo struct) e o campo "txt_mapping". Se não 
        # tiver, tenta obter a base com todos os campos. Trata-se de 
        # um "workaround" sendo o correto que a estrutura de dados 
        # na table "lb_base" esteja atualizada! By Questor
        bases = [ ]
        req = None
        try:
            params = """{
                "select": [
                    "name",
                    "idx_exp_time",
                    "idx_exp_url",
                    "txt_mapping"
                ],
                "literal": "idx_exp is true",
                "limit": null
            }"""
            req = requests.get(config.REST_URL, params={'$$':params})
            if config.FORCE_INDEX == True:
                data = [ ]
                results = dict({
                    u'metadata' : {
                        u'idx_exp_url'  : u''+config.ES_URL+'',
                        u'name'         : u''+config.NM_BASE+'',
                        u'idx_exp_time' : u''+config.TIME_IDX+''
                    }
                })
                data.append(results)
                bases = data
            else:
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
        except Exception as e:
            bases = [ ]
            req = None
            try:
                params = """{
                    "literal": "idx_exp is true",
                    "limit": null
                }"""
                req = requests.get(config.REST_URL, params={'$$':params})
                req.raise_for_status()
                response = req.json()
                bases = response["results"]
            except Exception as e:
                # NOTE: A variável de instância "self.con_refsd" 
                # serve p/ evitar que o aviso mais abaixo seja 
                # exibido repetidamente detonando o log! By Questor
                if self.con_refsd:
                    return bases

                # NOTE: Estou usando '"Connection refused" in str(e)' 
                # pq "raise_for_status()" mais acima não retorna uma 
                # exceção do tipo "requests.exceptions.HTTPError" de 
                # forma q possamos usar o código em "status_code" 
                # tratar erro de forma mais específica! By Questor
                if "Connection refused" in str(e) and not self.con_refsd:
                    logger.error('Erro ao obter a lista bases para '\
                    'indexação. URL: %s. FALHA: Servidor indisponivel! '\
                    'HTTPCode: 502 (Connection refused)!' % (config.REST_URL))
                    self.con_refsd = True
                    return bases
                self.con_refsd = False
                fail_content = None
                if req is not None:
                    fail_content = req._content
                else:
                    fail_content = str(e)
                logger.error(
                    ("Erro ao obter a lista bases para indexação. "
                        "URL: %s. FALHA: %s") % (
                        config.REST_URL, 
                        fail_content))
        return bases

    def get_passed_registries(self):
        """Retorna registros da base de log erros de indexação. 
        Apenas "id_doc_orig" e "dt_last_up_orig".
        """

        # NOTE: Cria base de log se não existir! By Questor
        self.create_log_base()

        registries = [ ]
        params = {'$$':"""{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
            }""" % self.base}
        url = config.REST_URL + '/log_lbindex/doc'

        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                1 Erro ao recuperar registros da base %s'. FALHA: %s
            """ % ('log_lbindex', fail_content))

        resp = {}
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp

    def get_registries(self):
        """Retorna registros à serem indexados que sob certos critérios não 
        tenham falhado no passado.
        """

        # NOTE: Build the query for the registries to be indexed
        # (all of them when FORCE_INDEX is set).
        registries = [ ]
        if config.FORCE_INDEX:
            params = {'$$':'{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {
                '$$':'{"select":["id_doc", "dt_last_up"], \
                "literal":"dt_idx is null", "limit": %d}'
            }

        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT

        url = config.REST_URL + '/' + self.base + '/doc'

        req = None
        try:
            req = requests.get(url, params=params)
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                Erro ao recuperar registros da base %s'. FALHA: %s
            """ % (self.base, fail_content))

        '''
        TODO: Essa lógica poderia ser mais eficiente... A 
        princípio vejo duas soluções...
        1 - Guardar em cache (mais complicada);
        2 - Trazer apenas os registros (id_doc) envolvidos 
        no processo de indexação atual.
        By Questor
        '''

        '''
        TODO: Esse método "self.get_passed_registries()" deveria 
        ser chamado sempre? Mesmo quando a operação é "create"? 
        Checar melhor... By Questor
        '''

        # NOTE: Obtêm registros da base de log de erros! Registros 
        # q tenham falhado no passado! By Questor
        passed = self.get_passed_registries()

        _registries = [ ]
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                '''
                NOTE: O objetivo aqui é checar se o registro 
                está no log de erros (registros que tentou-se 
                indexar no passado) e se estiver ignora-os a 
                não ser que a data de "update" do registro 
                registrado na base de logs seja diferente da 
                data atual do registro, nesses casos o LBIndex 
                vai tentar novamente!
                By Questor
                '''

                '''
                NOTE: No dict "passed" consta apenas o valor 
                do campo "dt_last_up_orig" da base "log_lbindex"! 
                By Questor
                '''
                dt_last_up = passed[reg['_metadata']['id_doc']]

                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)

        return _registries

    def get_full_reg(self, id, dt_last_up):
        """Obtêm o registro doc mais textos extraídos dos arquivos anexos se 
        houverem.
        """

        # TODO: Registrar essa ação no log toda "santa vez"? By Questor
        logger.info('Recuperando registro %s da base %s ...' % 
            (str(id), self.base))

        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'

        req = None
        try:
            req = requests.get(url)
            req.raise_for_status()
            response = req.json()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
                Erro ao recuperar registro %s na base %s'. FALHA: %s
            """ % (str(id), self.base, fail_content)

            # TODO: Pq duas chamadas as logs? By Questor
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def es_create_mapping(self):
        """Cria um mapping p/ uma base se houver configuração p/ isso."""

        response_0 = None
        response_0_json = None
        index_url = None
        try:
            index_url = ("http://" + self.idx_exp_host + "/" + 
                self.idx_exp_index + "/" + self.idx_exp_type)
            response_0 = requests.get(index_url + "/_mapping")
            response_0.raise_for_status()
            response_0_json = response_0.json()
        except requests.exceptions.HTTPError as e:

            # NOTE: Normalmente entrará nesse bloco de código 
            # quando o índice não existe! By Questor
            self.es_create_index()
        except requests.exceptions.RequestException as e:
            raise Exception("Problem in the mapping provider! " + str(e))
        except Exception as e:
            raise Exception("Mapping operation. Program error! " + str(e))

        if (response_0.status_code == 200 and not response_0_json and 
                (self.txt_mapping is not None and self.txt_mapping)):
            response_1 = None
            try:
                response_1 = self.es.put_mapping(
                    index=self.idx_exp_index,
                    doc_type=self.idx_exp_type,
                    mapping=self.txt_mapping)

                if (response_1 is None or
                        response_1.get("acknowledged", None) is None or
                        response_1.get("acknowledged", None) != True):
                    raise Exception("Retorno inesperado do servidor \
                        ao criar mapping! " + 
                        str(response_1))
            except Exception as e:
                raise Exception("Mapping creation error! " + str(e))

    def es_create_index(self):
        """Criar um índice p/ a base com as configurações setadas, não havendo 
        criar um índice genérico.
        """

        response_0 = None
        try:
            cfg_idx_holder = None

            # NOTE: Se não houver configuração de indexação "setada" 
            # o sistema vai criar uma padrão! By Questor
            if self.cfg_idx is not None and self.cfg_idx:
                cfg_idx_holder = self.cfg_idx
            else:
                cfg_idx_holder = {
                        "settings":{
                            "analysis":{
                                "analyzer":{
                                    "default":{
                                        "tokenizer":"standard",
                                        "filter":[
                                            "lowercase",
                                            "asciifolding"
                                        ]
                                    }
                                }
                            }
                        }
                    }

            response_0 = self.es.create_index(index=self.idx_exp_index,
                                              settings=cfg_idx_holder)

            if (response_0 is None or
                response_0.get("acknowledged", None) is None or
                response_0.get("acknowledged", None) != True):
                raise Exception("Retorno inesperado do servidor \
                    ao criar index! " + 
                    str(response_0))

            self.es_create_mapping()
        except IndexAlreadyExistsError as e:
            self.es_create_mapping()
        except Exception as e:
            raise Exception("Index creation error! " + str(e))

    def index_member(self, registry, id, dt_last_up):
        """Criar o índice textual para cada registro."""

        logger.info(
            'Indexando registro %s da base %s na url %s ...' % (
                str(id), 
                self.base, self.idx_exp_url))

        try:

            # NOTE: Trata e cria os mappings e index textuais! 
            # By Questor
            self.es_create_mapping()
            self.es.index(self.idx_exp_index, self.idx_exp_type, 
                          registry, id=id)
            return True
        except Exception as e:
            error_msg = ("Erro ao indexar registro %s da base %s na url %s'. "
                "Mensagem de erro: %s") % (
                str(id), 
                self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)

            # TODO: Pq dois logs? By Questor
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        """Atualizar a data de atualização da indexação textual do registro."""

        logger.info('Alterando data de indexacao do '\
            'registro %s da base %s ...' % (str(id), self.base))
        params = {'value': datetime.datetime.now().\
            strftime('%d/%m/%Y %H:%M:%S')}
        url = (config.REST_URL + '/' + self.base + '/doc/' + str(id) + 
            '/_metadata/dt_idx')

        req = None
        try:
            req = requests.put(url, params=params)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = 'Erro ao alterar data de indexacao do registro %s na '\
                'base %s. FALHA: %s' % (str(id), self.base, fail_content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """Write errors to LightBase."""

        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = None
        try:
            req = requests.post(url, data=data)
            req.raise_for_status()
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                0 Erro ao tentar escrever erro no Lightbase. FALHA: %s
            """ % fail_content)

    def get_errors(self):
        """Get all bases which has to index registries."""

        errors = [ ]
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'

        req = None
        try:
            req = requests.get(url, params={'$$':params})
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            logger.error("""
                Erro ao tentar recuperar erros de indice. URL: %s. FALHA: %s
            """ % (url, fail_content))
        return errors

    # TODO: Esse método serve para criar/atualizar p/ uma 
    # indexação (index) padrão! No momento está "desvirtuado", 
    # pois basta apagar o índice p/ q ele seja recriado com a 
    # indexação setada na rota "_txt_idx"! Creio que esse 
    # método não faz muito sentido aqui. Sugiro remover! 
    # By Questor
    def create_index(self):
        """Cria índice com as opções de mapeamento padrão
        Atualiza o índice se já estiver criado.
        """

        settings = {
            "settings":{
                "analysis":{
                    "analyzer":{
                        "default":{
                            "tokenizer":"standard",
                            "filter":[
                                "lowercase",
                                "asciifolding"
                            ]
                        }
                    }
                }
            }
        }

        http, space, address, _index, _type = self.idx_exp_url.split('/')

        try:
            result = self.es.create_index(
                index=_index,
                settings=settings
            )
        except IndexAlreadyExistsError as e:
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(
                index=_index,
                settings=settings
            )
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        """Deletar registros no index."""

        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True

        except ElasticHttpNotFoundError as e:
            return True

        except Exception as e:
            error_msg = 'Erro ao deletar indice %s da base %s na url %s. '\
                'Mensagem de erro: %s' % \
                (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        """Deletar registro de erros na rota '_index_error'."""

        url = (config.REST_URL + 
            """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}""")
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)

        req = None
        try:
            req = requests.delete(url)
            req.raise_for_status()
            return True
        except Exception as e:
            fail_content = None
            if req is not None:
                fail_content = req._content
            else:
                fail_content = str(e)
            error_msg = """
                Erro ao deletar erro de indice. FALHA: %s
            """ % (fail_content)
            logger.error(error_msg)
        return False

    @staticmethod
    def create_log_base():
        """Cria base de log do LBIndex caso não exista."""

        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:

            # NOTE: Cria a base já que ela não existe!
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s", 
                             response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")
        return True
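
For context, a hedged sketch of how an LBRest instance like the one above is typically driven. The loop is illustrative only (the real LBIndex consumer lives elsewhere in the project); it only uses methods defined in the class above:

def index_base(base_cfg):
    # base_cfg is one entry returned by LBRest().get_bases()
    lbrest = LBRest(
        base=base_cfg['metadata']['name'],
        idx_exp_url=base_cfg['metadata']['idx_exp_url'])

    for reg in lbrest.get_registries():
        id_doc = reg['_metadata']['id_doc']
        dt_last_up = reg['_metadata']['dt_last_up']

        full_reg = lbrest.get_full_reg(id_doc, dt_last_up)
        if full_reg is None:
            continue  # failure already written to log_lbindex

        if lbrest.index_member(full_reg, id_doc, dt_last_up):
            lbrest.update_dt_index(id_doc, dt_last_up)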
예제 #42
0
파일: lbrest.py 프로젝트: lightbase/LBIndex
class LBRest():

    def __init__(self, base=None, idx_exp_url=None):
        self.base = base
        self.idx_exp_url = idx_exp_url
        if self.idx_exp_url is not None:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es = ElasticSearch('/'.join([http, space, address]))

    def get_bases(self):
        """ Get all bases which has to index registries
        """
        bases = [ ]
        params = """{
            "select": [
                "name",
                "idx_exp_time",
                "idx_exp_url"
            ],
            "literal": "idx_exp is true",
            "limit": null
        }"""
        req = requests.get(config.REST_URL, params={'$$':params})
        try:
            req.raise_for_status()
            response = req.json()
            bases = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar bases. url: %s. Resposta: %s
            """ % (config.REST_URL, req._content))
        return bases

    def get_passed_registries(self):
        """
        Realiza leitura da base de log de indexação
        """
        # Cria base de log se não existir
        self.create_log_base()
        registries = [ ]
        params = {'$$':"""{
            "select":["id_doc_orig", "dt_last_up_orig"],
            "literal": "nm_base = '%s'",
            "limit": null
            }""" % self.base }
        url = config.REST_URL + '/log_lbindex/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s'. Resposta: %s
            """ % ('log_lbindex', req._content))


        resp = {} 
        for reg in registries:
            resp[reg['id_doc_orig']] = reg['dt_last_up_orig']
        return resp
        #return {reg['id_doc_orig']: reg['dt_last_up_orig'] for reg in registries}
        
    def get_registries(self):
        """Função que lista todos os registros a serem indexados"""
        registries = [ ]
        if config.FORCE_INDEX:
            params = {'$$':'{"select":["id_doc", "dt_last_up"], "limit": %d}'}
        else:
            params = {'$$':'{"select":["id_doc", "dt_last_up"],"literal":"dt_idx is null", "limit": %d}'}

        params.update(result_count='false')
        params['$$'] = params['$$'] % config.DEFAULT_LIMIT

        url = config.REST_URL + '/' + self.base + '/doc'
        req = requests.get(url, params=params)
        try:
            req.raise_for_status()
            response = req.json()
            registries = response["results"]
        except:
            logger.error("""
                Erro ao recuperar registros da base %s'. Resposta: %s
            """ % (self.base, req._content))





        # Erro ao recuperar registros da base docs_pro'. Resposta: {"status": 500, 
        # "request": {"path": "/api/docs_pro/doc", "client_addr": "10.72.246.21", 
        #         "user_agent": "python-requests/2.3.0 CPython/2.6.6 Linux/2.6.32-504.el6.x86_64", 
        #         "method": "GET"}, "error_message": "SearchError: (OperationalError) could not 
        # connect to server: No route to host\n\tIs the server running on host \"10.72.247.144\" 
        # and accepting\n\tTCP/IP connections on port 5432?\n None None", "type": "Exception"}





        passed = self.get_passed_registries()
        _registries = [ ]
        for reg in registries:
            if reg['_metadata']['id_doc'] in passed:
                dt_last_up = passed[reg['_metadata']['id_doc']]
                if dt_last_up != reg['_metadata']['dt_last_up']:
                    _registries.append(reg)
            else:
                _registries.append(reg)

        return _registries

    def get_full_reg(self, id, dt_last_up):
        logger.info('Recuperando registro %s da base %s ...' % (str(id), self.base))
        response = None
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/full'
        req = requests.get(url)
        try:
            req.raise_for_status()
            response = req.json()
        except:
            error_msg = """
                Erro ao recuperar registro %s na base %s'. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return response

    def index_member(self, registry, id, dt_last_up):
        logger.info('Indexando registro %s da base %s na url %s ...' % (str(id), self.base, self.idx_exp_url))
        try:

            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.index(_index, _type, registry, id=id)
            return True

        except Exception as e:
            error_msg = """
                Erro ao indexar registro %s da base %s na url %s'. Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
            return False

    def update_dt_index(self, id, dt_last_up):
        logger.info('Alterando data de indexacao do registro %s da base %s ...' % (str(id), self.base))
        params = {'value': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')}
        url = config.REST_URL + '/' + self.base + '/doc/' + str(id) + '/_metadata/dt_idx'
        req = requests.put(url, params=params)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao alterar data de indexacao do registro %s na base %s'. Resposta: %s
            """ % (str(id), self.base, req._content)
            logger.error(error_msg)
            self.write_error(id, dt_last_up, error_msg)
        return False

    def write_error(self, id_doc, dt_last_up, error_msg):
        """ Write errors to LightBase
        """
        error = {
            'nm_base': self.base,
            'id_doc_orig': id_doc,
            'error_msg': error_msg,
            'dt_error': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'dt_last_up_orig': dt_last_up
        }
        url = config.REST_URL + '/log_lbindex/doc'
        data = {'value': json.dumps(error)}
        req = requests.post(url, data=data)
        try:
            req.raise_for_status()
        except:
            logger.error("""
                Erro ao tentar escrever erro no Lightbase. Resposta: %s
            """ % req._content)

    def get_errors(self):
        """ Get all bases which has to index registries
        """
        errors = [ ]
        params = """{
            "literal": "base = '%s'",
            "limit": 250
        }""" % (self.base)
        url = config.REST_URL + '/_index_error'
        req = requests.get(url, params={'$$':params})
        try:
            req.raise_for_status()
            response = req.json()
            errors = response["results"]
        except:
            logger.error("""
                Erro ao tentar recuperar erros de indice. url: %s. Resposta: %s
            """ % (url, req._content))
        return errors

    def create_index(self):
        """
        Cria índice com as opções de mapeamento padrão
        Atualiza o índice se já estiver criado
        """
        settings = {
            "settings": {
                # "number_of_shards": "5",
                # "number_of_replicas": "1",
                "analysis.analyzer.default.filter.0": "lowercase",
                "analysis.analyzer.default.filter.1": "asciifolding",
                "analysis.analyzer.default.tokenizer": "standard",
                "analysis.analyzer.default.type": "custom",
                "analysis.filter.pt_stemmer.type": "stemmer",
                "analysis.filter.pt_stemmer.name": "portuguese"
            },
            "mappings": {
                "document": {
                    "_timestamp": {
                        "enabled": "true"
                    }
                }
            }
        }

        http, space, address, _index, _type = self.idx_exp_url.split('/')
        try:
            result = self.es.create_index(
                index=_index,
                settings=settings
            )
        except IndexAlreadyExistsError as e:
            logger.info("O índice já existe. Tentando atualizar o mapping...")
            self.es.close_index(index=_index)
            result = self.es.update_settings(
                index=_index,
                settings=settings
            )
            logger.info("Mapping atualizado com sucesso. Abrindo o índice...")
            self.es.open_index(index=_index)
            logger.info("Índice reaberto com sucesso!")

    def delete_index(self, registry):
        id = registry['id_doc']
        try:
            http, space, address, _index, _type = self.idx_exp_url.split('/')
            self.es.delete(_index, _type, id=id)
            return True

        except ElasticHttpNotFoundError as e:
            return True

        except Exception as e:
            error_msg = """
                Erro ao deletar indice %s da base %s na url %s'. Mensagem de erro: %s
            """ % (str(id), self.base, self.idx_exp_url, str(e))
            logger.error(error_msg)
            return False

    def delete_error(self, registry):
        url = config.REST_URL + """/_index_error?$$={"literal":"base = '%s' and id_doc = %d"}"""
        url = url % (registry['base'], registry['id_doc'])
        logger.info('Deletando registro de erro de indice na url %s' % url)
        req = requests.delete(url)
        try:
            req.raise_for_status()
            return True
        except:
            error_msg = """
                Erro ao deletar erro de indice. Resposta: %s
            """ % (req._content)
            logger.error(error_msg)
        return False

    @staticmethod
    def create_log_base():
        """
        Cria base de log do índice caso não exista
        """
        log_base = model.LogBase()
        response = log_base.get_base()
        if not response:
            # Cria a base já que ela não existe
            logger.info("Criando base de log do índice...")
            result = log_base.create_base()
            if result is None:
                logger.error("Erro na criação da base de log: \n%s", response.text)
                return False
            else:
                logger.info("Base de log criada com sucesso!")

        return True
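
The flattened "analysis.analyzer.default.*" keys in this older version and the nested settings object used in 예제 #41 should be equivalent; Elasticsearch accepts index settings in either the dotted or the structured form. A hedged sketch of the nested equivalent of the dotted keys above (the portuguese stemmer filter is declared but, as in the flat version, not attached to the default analyzer):

nested_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "default": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding"]
                }
            },
            "filter": {
                "pt_stemmer": {
                    "type": "stemmer",
                    "name": "portuguese"
                }
            }
        }
    },
    "mappings": {
        "document": {
            "_timestamp": {"enabled": "true"}
        }
    }
}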
예제 #43
0
def isint(string):
    try:
        int(string)
        return True
    except ValueError:
        return False


def isflt(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


for item in root.findall('artikel'):
    articles = {}
    for subitem in item:
        if subitem.text is not None:
            if isint(subitem.text):
                articles[subitem.tag] = int(subitem.text)
            elif isflt(subitem.text):
                articles[subitem.tag] = float(subitem.text)
            else:
                articles[subitem.tag] = subitem.text
    # Append after the article dict is fully built, so the last article
    # is not dropped and no stale dict is appended up front.
    list.append(articles)

for i, article in enumerate(list):
    if len(article) == 0:
        print "Empty value found"
    else:
        print es.index('articles3', 'article', article, id=i)
예제 #44
0
class TestClient(unittest.TestCase):
    def setUp(self):
        super(TestClient, self).setUp()
        docs = []

        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****')

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)

        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(
            client('downloads_count',
                   START,
                   '2012-05-01',
                   interval='month',
                   add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
예제 #45
0
		"description": crawled_data.get("description"),
		"logo": crawled_data.get("logo"),
		"twitter": crawled_data.get("twitter"),
		"station_site": crawled_data.get("station_site"),
		"primary_genre": crawled_data.get("primary_genre"),
		"frequency": crawled_data.get("frequency"),
		"shoutcast_url": crawled_data.get("shoutcast_url"),
	}

	# TODO: get lat, lon
	if hasattr(settings, 'GEONAMES_USER') and settings.GEONAMES_USER != "demo":
		params = {
			"name_equals": index_data["city"],
			"country": index_data["country"],
			"adminCode1": index_data["state"],
			"maxRows": 10,
			"lang": "en",
			"username": settings.GEONAMES_USER,
			"style": "medium"
		}
		geo_request = requests.get("http://api.geonames.org/searchJSON", params=params)
		geonames = geo_request.json().get("geonames", [])
		if geonames:
			index_data["location"] = {
				"lat": float(geonames[0]["lat"]),
				"lon": float(geonames[0]["lng"])
			}

	es.index(INDEX_NAME, 'station', index_data, id=crawled_data.get('id'))

print("Bailed after %d failures (pk %d)" % (failures, pk))
예제 #46
0
def get_netranges(starting_ip='1.0.0.0',
                  last_ip='2.0.0.0',
                  elastic_search_url='http://127.0.0.1:9200/',
                  index_name='netblocks',
                  doc_name='netblock',
                  sleep_min=1,
                  sleep_max=5):
    connection = ElasticSearch(elastic_search_url)
    current_ip = starting_ip

    while True:
        # See if we've finished the range of work
        if ip2long(current_ip) > ip2long(last_ip):
            return

        current_ip = get_next_undefined_address(current_ip)

        if current_ip is None:  # No more undefined ip addresses
            return

        print current_ip

        try:
            whois_resp = IPWhois(current_ip).lookup_rws()
        except Exception as error:
            """
            If a message like: 'STDERR: getaddrinfo(whois.apnic.net): Name or
            service not known' appears' then print it out and try the next
            IP address.
            """
            print type(error), error
            current_ip = get_next_ip(current_ip)

            if current_ip is None:
                return  # No more undefined ip addresses
            gevent.sleep(randint(sleep_min, sleep_max))
            continue

        if 'asn_cidr' in whois_resp and \
            whois_resp['asn_cidr'] is not None and \
            whois_resp['asn_cidr'].count('.') == 3:
            last_netrange_ip = get_netrange_end(whois_resp['asn_cidr'])
        else:
            try:
                last_netrange_ip = \
                    whois_resp['nets'][0]['range'].split('-')[-1].strip()
                assert last_netrange_ip.count('.') == 3
            except:
                # No match found for n + 192.0.1.0.
                print 'Missing ASN CIDR in whois resp: %s' % whois_resp
                current_ip = get_next_ip(current_ip)

                if current_ip is None:
                    return  # No more undefined ip addresses

                gevent.sleep(randint(sleep_min, sleep_max))
                continue

        assert last_netrange_ip is not None and \
               last_netrange_ip.count('.') == 3, \
               'Unable to find last netrange ip for %s: %s' % (current_ip,
                                                               whois_resp)

        # Save current_ip and whois_resp
        entry = {
            'netblock_start': current_ip,
            'netblock_end': last_netrange_ip,
            'block_size': ip2long(last_netrange_ip) - ip2long(current_ip) + 1,
            'whois': json.dumps(whois_resp),
        }

        keys = ('cidr', 'name', 'handle', 'range', 'description', 'country',
                'state', 'city', 'address', 'postal_code', 'abuse_emails',
                'tech_emails', 'misc_emails', 'created', 'updated')

        for _key in keys:
            entry[_key] = str(whois_resp['nets'][0][_key]) \
                          if _key in whois_resp['nets'][0] and \
                             whois_resp['nets'][0][_key] else None

            if _key == 'city' and entry[_key] and ' ' in entry[_key]:
                entry[_key] = entry[_key].replace(' ', '_')

        try:
            connection.index(index_name, doc_name, entry)
        except ElasticHttpError as error:
            print 'At %s. Unable to save record: %s' % (current_ip, entry)
            raise error

        current_ip = get_next_ip(last_netrange_ip)

        if current_ip is None:
            return  # No more undefined ip addresses

        gevent.sleep(randint(sleep_min, sleep_max))
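
The helpers ip2long and get_netrange_end are not shown in this snippet; a hedged sketch of one plausible implementation, matching how they are used above (dotted-quad to integer, and the last address covered by a CIDR block):

import socket
import struct


def ip2long(ip):
    """Convert a dotted-quad IPv4 address to an unsigned integer."""
    return struct.unpack('!L', socket.inet_aton(ip))[0]


def long2ip(value):
    """Convert an unsigned integer back to a dotted-quad IPv4 address."""
    return socket.inet_ntoa(struct.pack('!L', value))


def get_netrange_end(cidr):
    """Return the last IPv4 address of a CIDR block such as '1.0.0.0/24'."""
    network, prefix = cidr.split('/')
    host_bits = 32 - int(prefix)
    return long2ip(ip2long(network) | ((1 << host_bits) - 1))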
예제 #47
0
class ItvacaturesParseStrategy(ParseStrategy.ParseStrategy):
    def __init__(self):
        self.website = "it-vacatures"
        # bind the Elasticsearch client to self.es
        self.es = ElasticSearch('http://localhost:9200/')

    def parseTitel(self, soup):
        titel = soup.head.title.string
        return titel

    def parseWerkgever(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        p = re.compile(r'<.*?>')
        werkgever = p.sub('', str(infoTwee))
        return werkgever

    def parseLocatie(self, soup):
        info = soup.find("td")
        infoTwee = info.find_next_sibling()
        locatieEen = infoTwee.find_next()
        p = re.compile(r'<.*?>')
        locatieTwee = p.sub('', str(locatieEen))
        p = re.compile(r'Locatie')
        locatie = p.sub('', str(locatieTwee))
        locatie = locatie.strip()
        return locatie

    def parseInhoud(self, soup):
        body = soup.find("div", {"id": "job-description"})
        p = re.compile(r'<.*?>')
        inhoud = p.sub('', str(body))
        return inhoud

    def parse(self, websiteUrl):
        soup = self.getSoup(websiteUrl)

        # parse the page
        titel = self.parseTitel(soup)
        try:
            werkgever = self.parseWerkgever(soup)
        except:
            werkgever = "-"
        try:
            locatie = self.parseLocatie(soup)
        except:
            locatie = "-"
        inhoud = self.parseInhoud(soup)
        websiteUrl = re.sub(r'(?s)/\*.*\*/', '', websiteUrl)
        datum = time.strftime("%d-%m-%Y")
        # generate id (string)
        id = self.website + "-" + re.sub(r'\W+', '', titel)

        # make document to be sent to the elasticsearch database
        document = self.makeDocument(id, titel, websiteUrl, self.website,
                                     datum, werkgever, locatie, "-", inhoud)

        # index (store) the vacancy in the ES database
        self.es.index('vacature-index',
                      'vacature',
                      document,
                      id=document['id'])
        print "Es: " + titel