Example No. 1
    def GroupDBMaster(self):

        logger.debug('Dynamic Scraping Consumer --- Group Master Called')

        Groupdb = DynamicCommonConnection.MySQLConnection()

        group = Groupdb.cursor()
        group.execute("select * from tbl_Bli_GroupMaster")
        grouplist = []
        for row in group.fetchall():
            BusinessType = row[6]

            if "Retail" in BusinessType:

                GroupRowList = (row[1], row[2] or 0)
                grouplist.append(GroupRowList)

            elif "Hotel" in BusinessType:
                GroupRowList = (row[1], row[2] or 0)
                grouplist.append(GroupRowList)

        grouplist_dict = dict(grouplist)

        group.close()
        Groupdb.close()
        return grouplist_dict
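
GroupDBMaster returns a dict mapping a group/queue name (row[1]) to the number of consumers to attach to it (row[2]); Main in Example No. 4 iterates this dict and binds that many consumers to each queue. A small illustrative value, with hypothetical group names borrowed from the queues in Example No. 14:

# hypothetical output of GroupDBMaster(); real names and counts come from tbl_Bli_GroupMaster
grouplist_dict = {"RS": 2, "Arrow": 1, "Conrad": 3}

for key, value in grouplist_dict.items():
    for _ in range(int(value)):
        print("attach a consumer to queue", key)
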
Example No. 2
    def DatabaseQueryCall(self):

        logger.debug("Dynamic Parser Producer ---  Database Called")
        '''
        Query Logic task
        :return:
        '''
        print("Parser Database Called ")

        last_updated_date = self.mongodb.ParserQueueUpdate.find_one({'_id': 1})
        Last_Parser_update_date = last_updated_date['QueueUpdateDateTime']
        #records = db.HTMLRepository.find({'$and': [{'TimeStamp': {'$gte': Last_Parser_update_date}}]})

        # materialise the cursor into a list so the emptiness check below works
        records = list(self.mongodb.HTMLRepository.find({'parsingStatus': 1}))
        print(records)

        if not records:

            SYSdate = datetime.datetime.now()
            self.mongodb.ParserQueueUpdate.update({'PARSER': '1'}, {
                "$set": {
                    'QueueUpdateDateTime':
                    datetime.datetime.strftime(SYSdate, '%Y-%m-%d %H:%M:%S')
                }
            })

        return records
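
The snippets on this page rely on a shared connection helper (DynamicCommonConnection here, CommonConnection in later examples) that is not shown. A minimal sketch of what such a module might provide, assuming pymysql, pymongo and pika with placeholder hosts and credentials:

import pymysql
import pika
from pymongo import MongoClient


class CommonConnection:
    """Hypothetical stand-in for the project's shared connection helpers."""

    @staticmethod
    def MySQLConnection():
        # placeholder credentials; the real values live in the project's configuration
        return pymysql.connect(host='localhost', user='user',
                               passwd='secret', db='eCube_Centralized_DB')

    @staticmethod
    def MongoConnection():
        # the snippets read collections such as HTMLRepository and ParserQueueUpdate
        return MongoClient('localhost', 27017).HTMLDumps

    @staticmethod
    def RabbitMQConnection():
        return pika.BlockingConnection(pika.ConnectionParameters('localhost'))
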
Example No. 3
    def callback(self, ch, method, properties, body):
        #print(type(body))
        print("Receiving Messages -- %r" % body)
        print("Consumer running Time", datetime.datetime.now())

        try:
            data = body.decode('utf-8')
            consume_data = data.replace("'", "\"")
            consume_data = json.loads(
                consume_data)  # convert string to python dict

        except Exception as e:
            # payload is not valid JSON; fall back to evaluating the repr of the dict
            data = body.decode('utf-8')
            consume_data = eval(data)

            # print('Error while converting into JSON')
            # logger.error("Consumer Input Json not properly Serialize"+ str(e))
        #st()
        scexec = ScriptsExecution(consume_data)
        data = scexec.run()

        # try:
        #     scexec = ScriptsExecution(consume_data)
        #     data=scexec.run()
        #     print("Received as a response",data)
        # except Exception as e:
        #     print('Error Occur Check logs',str(e))
        #     logger.error('Error at ConsumerRequestDataScraper:' +str(e))
        #     data = None
        # if data:
        ch.basic_ack(delivery_tag=method.delivery_tag)
        logger.debug('Crawling done')
Example No. 4
    def Main(self):
        '''
        Main will be called for all the functioning
        '''

        try:
            logger.debug('Dynamic Scraping Consumer --- Main function called')

            grouplistDict = self.GroupDBMaster()
            channel = self.RabbitConnection()

            print(grouplistDict)
            for key, value in grouplistDict.items():
                print("Sequence", key)
                for _ in range(int(value)):
                    try:
                        channel.basic_consume(DynamicConsumer.callback,
                                              queue=str(key),
                                              consumer_tag=None)
                    except pika.exceptions.ChannelClosed:
                        logger.error('Closed or no Queue ' + key)
                        print('Closed or no Queue ' + key)
                        channel = self.RabbitConnection()

            channel.start_consuming()
        except pika.exceptions.ConnectionClosed:
            DynamicConsumer().Main()
Example No. 5
def newconsume():
    logger.debug('Dynamic Scraping Reparse --- Ready to start')
    obj = ReparseConsumer()
    #t1 = threading.Thread(target=obj.Main, args=())
    #t1.start()
    obj.Main()
    print("Consumer running Time", datetime.datetime.now())
    time.sleep(0.1)
Example No. 6
    def Classification(self):
        logger.debug("Classfication Called")
        if (self.IsCategory == 1 or self.IsCategory == '1'):

            return ScriptsExecution.Category(self)
        else:

            #return self.Product()
            return ScriptsExecution.Product(self)
Example No. 7
    def RabbitConnection(self):

        logger.debug(
            'Dynamic Scraping Consumer --- RabbitMQ Connection called')

        connection = pika.BlockingConnection(
            pika.ConnectionParameters('localhost'))
        channel = connection.channel()
        channel.basic_qos(prefetch_count=1)
        return channel
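
RabbitConnection only opens a channel; the queues themselves are declared elsewhere with the x-max-length / x-max-priority arguments seen in Examples No. 8 and 14. A short sketch, assuming a local broker and one of the queue names from Example No. 14, that declares such a queue and publishes a persistent, prioritised message the way Example No. 27 does:

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()

# same queue arguments as Examples No. 8 and 14
args = {"x-max-length": 100000000, "x-max-priority": 9}
channel.queue_declare(queue='RS', durable=True, arguments=args)

# persistent (delivery_mode=2), high-priority message, as in Example No. 27
channel.basic_publish(exchange='',
                      routing_key='RS',
                      body='{"RequestId": 1}',
                      properties=pika.BasicProperties(delivery_mode=2,
                                                      priority=9))
connection.close()
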
Example No. 8
    def __init__(self):
        #threading.Thread.__init__(self)
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host='localhost'))
        self.channel = self.connection.channel()

        args = {}
        args["x-max-length"] = 100000000
        args['x-max-priority'] = 9

        logger.debug("Dynamic Queue Connection established")
Example No. 9
def getRequest():
    # db = pymysql.connect(host="192.168.8.67",
    #                      user="******",
    #                      passwd="eclerx#123",
    #                      db="eCube_Centralized_DB")

    # db = pymysql.connect(host="localhost",
    #                      user="******",
    #                      passwd="eclerx#123",
    #                      db="eCube_Centralized_DB")

    # db = pymysql.connect(host="192.168.8.37",
    #                      user="******",
    #                      passwd="eclerx#123",
    #                      db="eCube_Centralized_DB")

    # db = pymysql.connect(host="192.168.131.23",
    #                      user="******",
    #                      passwd="Eclerx#123",
    #                      db="eCube_Centralized_DB")

    db = pymysql.connect(**crawling_producer_config.get_pymysql_kwargs)
    cur = db.cursor()
    # return cur
    print("running")
    logger.debug('Database Connection Established')
    # cur = getConnection()

    # domainName = request.args.get('domainName')
    # domainName = 'http://www.mouser.com'

    # query_string = "select b.Id, a.DomainId, HeaderName, HeaderValue from eCube.tbl_DomainMaster a, eCube.tbl_DomainHeaderMapping b WHERE a.Id = b.DomainId and a.DomainName = '{domainName}'".format(domainName=domainName)
    # try:
    if True:
        # cur.execute(query_string)
        cur.callproc('spGetRequestRunDetail')
        res = cur.fetchall()
        # print("aetos res")
        # print(res)
        for r in res:
            print(r)
            # print(r[0],r[1])
            print(r[0], r[1], r[-1])
            # SaveRequest(r[0], r[1], cur,db)
            SaveRequest(r[0], r[1], r[-1], cur, db)
            UpdateStatus(r[0], cur, db)

        # r = [dict((cur.description[i][0], value)
        #           for i, value in enumerate(row)) for row in cur.fetchall()]
        cur.close()
Example No. 10
    def __init__(self):

        threading.Thread.__init__(self)
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host='localhost'))
        self.channel = self.connection.channel()
        args = {}

        args["x-max-length"] = 10000000
        args['x-max-priority'] = 9

        self.mongodb = CommonConnection.MongoConnection()
        self.db = CommonConnection.MySQLConnection()
        self.RabbitCon = CommonConnection.RabbitMQConnection()
        self.IPAddr = CommonConnection.ServivesIP()

        logger.debug("Dynamic Queue Connection established")
Example No. 11
    def ParserDynamicDBConnection(self):
        '''
        DB Connection here
        :return:
        '''
        logger.debug('Dynamic Parsing Consumer --- Group Master Called')
        Groupdb = CommonConnection.MySQLConnection()
        group = Groupdb.cursor()
        group.execute("select * from tbl_Bli_GroupMaster")
        grouplist = []
        for row in group.fetchall():
            GroupRowList = (row[1], row[2] or 0)
            grouplist.append(GroupRowList)

        grouplist_dict = dict(grouplist)

        group.close()
        Groupdb.close()
        return grouplist_dict
Example No. 12
def UpdateStatus(RequestRunId, cur, db):
    print('updated')
    # domainName = request.args.get('domainName')
    # domainName = 'http://www.mouser.com'
    # conn = mysql.connect()
    # cur = mysql.connect().cursor()
    # query_string = "select b.Id, a.DomainId, HeaderName, HeaderValue from eCube.tbl_DomainMaster a, eCube.tbl_DomainHeaderMapping b WHERE a.Id = b.DomainId and a.DomainName = '{domainName}'".format(domainName=domainName)
    try:
        # cur.execute(query_string)
        args = [RequestRunId]
        # cur.callproc('spInsertRequestDetails',args)
        # res = cur.fetchall()
        cur.callproc('spUpdateRequestStatus', args)
        db.commit()

        # r = [dict((cur.description[i][0], value)
        #           for i, value in enumerate(row)) for row in cur.fetchall()]
    except Exception as e:
        logger.debug('Error returned by spUpdateRequestStatus query: %s', str(e))
        return jsonify({'StatusCode': 500, 'ResultData': str(e)})
Example No. 13
    def MessageQueryCall(self):

        logger.debug("Dynamic Scrapping Producer -- Queue Called")

        db = CommonConnection.MySQLConnection()

        DBMessages = db.cursor()
        '''
        Stored procedure call
        '''

        DBMessages.callproc("MessagingHotelQueue")

        message = DBMessages.fetchall()
        logger.debug("Dynamic Queue HOTEL DB Connection called")

        DBMessages.close()
        db.close()

        return message
Example No. 14
    def QueueGetCount(self):

        logger.debug(
            'Dynamic Scraping Consumer --- To Get Queue Count called')
        '''
        To get Count How many Messages is in Queue
        :return:
        '''
        args = {}
        args["x-max-length"] = 100000000
        args['x-max-priority'] = 9

        self.Count_QUEUE1 = self.channel.queue_declare(
            queue='RS', durable=True, arguments=args).method.message_count
        self.Count_QUEUE2 = self.channel.queue_declare(
            queue='Arrow', durable=True, arguments=args).method.message_count

        self.Count_QUEUE3 = self.channel.queue_declare(
            queue='Conrad', durable=True, arguments=args).method.message_count

        return self.Count_QUEUE1, self.Count_QUEUE2, self.Count_QUEUE3
Example No. 15
    def __init__(self, consume_data):

        #global IsPreview,StartTime,RequestId, RequestRunId, SubRequestId, RequestUrl, DomainName, PointOfSale, IsCategory, ScraperScript, ParserScript, ScraperModuleName, ParserModuleName, Country
        #IsCategory = "1"
        self.RequestInputs = consume_data
        self.IsPreview = "No"
        self.StartTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.RequestId = consume_data["RequestId"]
        self.RequestRunId = consume_data["RequestRunId"]
        self.SubRequestId = consume_data["SubRequestId"]
        self.RequestUrl = consume_data['RequestUrl']
        self.DomainName = consume_data["DomainName"]
        self.PointOfSale = consume_data["PointOfSale"]
        self.IsCategory = consume_data["IsCategory"]
        self.ScraperScript = consume_data["ScraperScript"]
        #ScraperScript = "ConradPython_IT"
        self.ParserScript = consume_data["ParserScript"]
        self.Country = consume_data["Country"]
        self.ScraperModuleName = ''
        self.ParserModuleName = ''
        # Country = consume_data["Region"]

        # Added by Ankush for Retais /hotel Request Input JSON
        # self.RequestInputs = consume_data['RequestInputs']

        # Added by ankush for Dynamic Queuing
        self.GroupName = consume_data['GroupName']

        if consume_data["ScraperScript"]:
            self.ScraperModuleName = consume_data["ScraperScript"]
            self.ScraperModuleName = re.sub(r"\.py$", "", self.ScraperModuleName)

            #ScraperModuleName ="ScrapperConradPython_IT"
        if consume_data["ParserScript"]:
            self.ParserModuleName = consume_data["ParserScript"]
            self.ParserModuleName = re.sub(r"\.py$", "", self.ParserModuleName)

        logger.debug('Initialisation complete...:' + str(self.RequestUrl))
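
For reference, a hypothetical message body containing the keys this constructor reads from consume_data; every value below is an illustrative placeholder, and the run() call mirrors the usage in Examples No. 3 and 24:

consume_data = {
    "RequestId": 101,
    "RequestRunId": 5,
    "SubRequestId": 1001,
    "RequestUrl": "http://www.example.com/product/123",
    "DomainName": "example.com",
    "PointOfSale": "IT",
    "IsCategory": "0",
    "ScraperScript": "ScrapperExample_IT.py",
    "ParserScript": "ParserExample_IT.py",
    "Country": "Italy",
    "GroupName": "Conrad",
}

scexec = ScriptsExecution(consume_data)
data = scexec.run()
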
Example No. 16
    def Product(self):
        logger.debug("Product Called from Category :" + str(self.RequestUrl))
        Instance = ScriptsExecution.getInstance(self)
        result_data = Instance.getProductCrawl()

        # dict['Starttime'] = str(StartTime)
        # dict['EndTime'] = str(datetime.now())
        #
        #StartTime = datetime.datetime.now()
        #StartTimeConverted = datetime.date.strftime(StartTime, '%Y-%m-%d %H:%M:%S')
        result_data['startDT'] = self.StartTime

        EndTime = datetime.datetime.now()
        EndTimeConverted = EndTime.strftime('%Y-%m-%d %H:%M:%S')
        result_data['endDT'] = EndTimeConverted

        result_data['requestId'] = int(self.RequestId)
        result_data['subRequestId'] = int(self.SubRequestId)
        result_data['RequestRunId'] = int(self.RequestRunId)
        # Added by Ankush
        result_data['ParserScript'] = str(self.ParserScript)

        # added by ankush
        result_data['groupName'] = self.GroupName
        result_data['parsingStatus'] = 1  # Message ready for Parsing

        r = None

        try:
            r = json.dumps(result_data)
        except Exception as e:

            logger.error('JSON Dump error: ', exc_info=True)

        loaded_r = json.loads(r)

        logger.debug('Scraping done: ' + self.RequestUrl)
        self.result = requests.post(
            'http://192.168.8.7/site3/api/v1/SaveSourceHtml', json=r)

        #self.result = requests.post('http://192.168.7.128/site3/api/v1/SaveSourceHtml', json=r)
        logger.debug("Database Saving Response for " + str(self.RequestId) +
                     " : " + self.RequestUrl + " : " +
                     str(self.result.content))
        print("Saved Response------", self.result.content)

        try:
            Data = json.dumps(self.RequestInputs)
            updateSubID = requests.post(
                'http://192.168.8.7/site7/api/v1/update_subrequest_details',
                json=Data)
            print("UPDATE SbRequestID Service", updateSubID)
        except Exception as e:
            pass

        #print("Response recieved",self.result.content)
        return self.result.content
Example No. 17
    def run(self):

        SUBREQUESTID = []

        logger.debug("Dynamic Parser Producer ---  Main Function alled")

        records = ParserDynamicProducer.DatabaseQueryCall(self)

        for msg in records:

            if msg:
                self.channel.basic_publish(exchange='',
                                           routing_key="Parser" +
                                           str(msg['groupName']),
                                           body=str(msg))
                print("Queue Sending Messages", msg)

            SUBREQUESTID.append(msg['subRequestId'])

        self.connection.close()

        ParserDynamicProducer.UpdateSUBRequest(self, SUBREQUESTID)

        logger.debug("Dynamic Queue Connection closed successfully")
Example No. 18
            elif data == "Access Denied":

                AccessDenied = DATA_MAKER(consume_data)
                print("Access Denied", AccessDenied)

                DataInsert = db.PNFData.insert(AccessDenied)

                ch.basic_ack(delivery_tag=method.delivery_tag)

            else:

                ch.basic_ack(delivery_tag=method.delivery_tag)

    '''
    Below Code commented and added in startParserConsumerServices.py file 
    '''

    print("Connected")
    channel = Rabbit_connection.ParserQueueConnection(
        "")  # calling Parser Queue Connection class
    channel.basic_qos(prefetch_count=1)
    channel.basic_consume(callback, queue='Parser')
    channel.start_consuming()


logger.debug('Consumer Ready to start')

t1 = threading.Thread(target=ParserConsumer, args=[])
t1.start()
time.sleep(0.08)
Example No. 19
def SaveRequest(RequestRunId, RequestId, ReqModeId, cur, db):
    try:
        #     cur = getConnection()

        # cur.execute(query_string)
        args = [RequestId, RequestRunId, ReqModeId]
        # cur.callproc('spInsertRequestDetails',args)
        # res = cur.fetchall()

        # cur.callproc(procname='spInsertRequestDetails', args=args)

        # Code Change done for Hotel by Shrikant 04-06-2017 19:52 #

        if ReqModeId == 1:
            cur.callproc(procname='spInsertRequestDetails', args=args)
        if ReqModeId in (2, 3):
            args1 = [RequestId]
            print("args1 1")
            print(args1)
            cur.callproc('spGetPreCrawlDetails', args=args1)
            res = cur.fetchall()
            for i in res:
                if i[6] == 1:  # Based on Boardtype ID.
                    startDate = i[4]
                    endDate = i[5]
                    for n in range((endDate - startDate).days + 1):
                        thisdate = startDate + timedelta(n)
                        if thisdate.strftime('%A') in i[7]:
                            args2 = [i[1], RequestRunId, i[0], thisdate]
                            print('Arg2', args2)
                            if ReqModeId == 2:
                                cur.callproc(
                                    'spInsertHotelCrawlRequestDetails',
                                    args=args2)
                            elif ReqModeId == 3:
                                cur.callproc(
                                    'spInsertHotelFlightCrawlRequestDetails',
                                    args=args2)
                    db.commit()
                elif i[6] == 2:
                    for advancedt in i[10].split(','):
                        thisdate = date.today() + timedelta(int(advancedt))
                        args2 = [i[1], RequestRunId, i[0], thisdate]
                        print('Arg2 1', args2)
                        if ReqModeId == 2:
                            cur.callproc('spInsertHotelCrawlRequestDetails',
                                         args=args2)
                        elif ReqModeId == 3:
                            cur.callproc(
                                'spInsertHotelFlightCrawlRequestDetails',
                                args=args2)
                    db.commit()
                elif i[6] == 3:
                    for advancedt in i[10].split(','):
                        thisdate = datetime.strptime(advancedt,
                                                     '%m/%d/%Y').date()
                        args2 = [i[1], RequestRunId, i[0], thisdate]
                        print('Arg2 2', args2)
                        if ReqModeId == 2:
                            cur.callproc('spInsertHotelCrawlRequestDetails',
                                         args=args2)
                        elif ReqModeId == 3:
                            cur.callproc(
                                'spInsertHotelFlightCrawlRequestDetails',
                                args=args2)
                    db.commit()
                elif i[6] == 4:
                    for advancedt in i[10].split(','):
                        thisdate = date.today() + timedelta(int(advancedt) * 7)
                        args2 = [i[1], RequestRunId, i[0], thisdate]
                        print('Arg2 3', args2)
                        if ReqModeId == 2:
                            cur.callproc('spInsertHotelCrawlRequestDetails',
                                         args=args2)
                        elif ReqModeId == 3:
                            cur.callproc(
                                'spInsertHotelFlightCrawlRequestDetails',
                                args=args2)
                    db.commit()

        # cur.execute('spInsertRequestDetails', args)
        # cur.fetchall()

        # cur.close()

        # r = [dict((cur.description[i][0], value)
        #           for i, value in enumerate(row)) for row in cur.fetchall()]
    except Exception as e:
        logger.debug('Error returned by spInsertRequestDetails query: %s', str(e))
        return jsonify({'StatusCode': 500, 'ResultData': str(e)})
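
The board-type branches above expand advance-day strings into concrete check-in dates. A standalone sketch of that arithmetic, assuming the same comma-separated format (board type 2 counts days ahead, board type 4 counts weeks):

from datetime import date, timedelta

advance_days = "0,7,14"  # illustrative value for the i[10] column

# board type 2: each entry is a number of days from today
checkin_days = [date.today() + timedelta(int(d)) for d in advance_days.split(',')]

# board type 4: each entry is a number of weeks from today
checkin_weeks = [date.today() + timedelta(int(d) * 7) for d in advance_days.split(',')]

print(checkin_days, checkin_weeks)
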
Example No. 20
        #st()
        scexec = ScriptsExecution(consume_data)
        data = scexec.run()

        # try:
        #     scexec = ScriptsExecution(consume_data)
        #     data=scexec.run()
        #     print("Received as a response",data)
        # except Exception as e:
        #     print('Error Occur Check logs',str(e))
        #     logger.error('Error at ConsumerRequestDataScraper:' +str(e))
        #     data = None
        # if data:
        ch.basic_ack(delivery_tag=method.delivery_tag)
        logger.debug('Crawling done')

    # channel = Rabbit_connection.CrawlerQueueConnection("")   # calling Category Queue Connection class
    # channel.basic_qos(prefetch_count=1)
    # channel.basic_consume(callback,
    #                   queue='Crawler')
    # channel.start_consuming()


'''
logger.debug('Consumer Ready to start')
t1 = threading.Thread(target=CrawlerConsumer,args=[])
t1.start()
print("Consumer running Time",datetime.datetime.now())
time.sleep(0.1)
'''
Example No. 21
    def callback(ch, method, properties, body):

        print("parser Consumer Message time", datetime.datetime.now())
        logger.debug('Consumer Ready to start')

        print("Recieving Messages --", body)
        data = body.decode('utf-8')

        # consume_data = data.replace("'", "\"")
        # consume_data = json.loads(consume_data)  # convert string to python dict

        consume_data = ast.literal_eval(data)

        subRequestId = consume_data['subRequestId']
        client = MongoClient('localhost', 27017)
        #client = MongoClient('192.168.7.134', 27017)
        db = client.HTMLDumps
        consume_data = db.HTMLRepository.find_one(
            {'subRequestId': subRequestId})

        if consume_data['IsCategory'] == "Yes":

            json_data = json.dumps(consume_data)
            result = None  # guard so the check below works if the POST fails
            try:
                result = requests.post(
                    'http://192.168.8.7/site3/api/v1/SaveResponseData',
                    json=consume_data)
                #result = requests.post('http://192.168.7.128/site3/api/v1/SaveResponseData', json=consume_data)
            except Exception as e:
                pass
            print(result)
            if result:
                ch.basic_ack(delivery_tag=method.delivery_tag)

        else:
            data = ScriptsExecution.ConsumerRequestData('', **consume_data)
            '''
            try:
                data = ScriptsExecution.ConsumerRequestData('',**consume_data)
                print(data)
            except Exception as e:
                # logger.error('Script Executer Return Error',str(e))
                print("Script Executer error",e)
                data  = None
                  data = "Error"
            '''

            if data == "PNF":

                PNFData = DATA_MAKER(consume_data)
                print("PNFDATA", PNFData)

                ch.basic_ack(delivery_tag=method.delivery_tag)

                DataInsert = db.PNFData.insert(PNFData)

            elif data == "Access Denied":

                AccessDenied = DATA_MAKER(consume_data)
                print("Access Denied", AccessDenied)

                DataInsert = db.PNFData.insert(AccessDenied)

                ch.basic_ack(delivery_tag=method.delivery_tag)

            else:

                ch.basic_ack(delivery_tag=method.delivery_tag)
Example No. 22
    def run(self):
        logger.debug("Dynamic Scrapping Producer --- Main function  Called")
        '''
        Database Query Call 
        '''

        message = DynamicProducer.MessageQueryCall(self)

        if message:
            DumpDict = []
            l = []

            for row in message:

                row_list = [
                    ("RequestId", row[0]),
                    ("SubRequestId", row[1]),
                    ("RequestRunId", row[2]),
                    ("RequestUrl", str(row[3]) or str('')),
                    ('IsCategory', row[4]),
                    # ('IsCategory', str(row[4]) or str('')),    # commented for mouser testing
                    ("DomainName", row[5] or str('')),
                    ('ParserScript', row[6] or str('')),
                    ('ScraperScript', row[7] or str('')),
                    ("PointOfSale", row[8] or str('')),
                    ('Country', row[9] or str('')),
                    ('Region', "India"),
                    ('GroupName', row[10] or ""),
                    ("DomainID", row[11] or str('')),
                    ("BusinessType", "Retail")
                ]

                data_row_dict = dict(row_list)

                data_row_dict.update({
                    "RequestInput": {
                        "RequestUrl": data_row_dict['RequestUrl']
                    }
                })

                #print("Crawler Producer Input Dictionary", data_row_dict)

                DumpDict.append(data_row_dict)

            for newDump in DumpDict:
                a = (newDump['DomainID'])
                l.append(a)
            newdata = set(l)
            lDATA = (list(newdata))

            if len(lDATA) > 1:

                NewMessageDump = DynamicProducer.DomainWiseFetchALL(
                    "", DumpDict)
                Multi_SubRequest = []
                for message in NewMessageDump:
                    self.channel.basic_publish(exchange='',
                                               routing_key=str(
                                                   message['GroupName']),
                                               body=str(message))
                    print("Queue Sending Multiple Domain Messages -----------",
                          message)

                    Multi_SubRequest.append(message['SubRequestId'])

                MultiStatusUpdate = DynamicProducer.UpdateStatusPushedToQueue(
                    Multi_SubRequest)

                print(
                    "Messages pushed to queue and status updated in MySQL crawl table"
                )

            else:

                Single_SubRequest = []
                SingleFetch = DynamicProducer.SingleDomainEntry("", DumpDict)
                cnt = 0

                for single_message in SingleFetch:
                    self.channel.basic_publish(
                        exchange='',
                        routing_key=str(single_message['GroupName']),
                        body=str(single_message))

                    print("Queue Sending Single Domain Messages ------------",
                          single_message)

                    Single_SubRequest.append(single_message['SubRequestId'])
                    cnt += 1

                MultiStatusUpdate = DynamicProducer.UpdateStatusPushedToQueue(
                    Single_SubRequest)

                if MultiStatusUpdate == "Successfully updated Status":

                    time.sleep(10)
                    t2 = DynamicProducer()
                    #t2.start()

        self.connection.close()
        logger.debug("Dynamic Queue Connection closed successfully")
Example No. 23
    def run(self):
        logger.debug("Dynamic Scrapping Producer --- Main function  Called")
        '''
        Database Query Call 
        '''
        message = HotelCrawling.MessageQueryCall(self)
        if message:
            DumpDict = []
            l = []
            for row in message:
                row_list = [("requestId", row[0]), ("subRequestId", row[1]),
                            ("requestRunId", row[2]), ('IsCategory', row[3]),
                            ("DomainName", row[4] or str('')),
                            ('ParserScript', row[5] or str('')),
                            ('ScraperScript', row[6] or str('')),
                            ('GroupName', row[7] or ""),
                            ("DomainID", row[8] or str('')),
                            ("RequestUrl", str('')), ("PointOfSale", ''),
                            ("BusinessType", "Hotel"),
                            (
                                "country",
                                row[19] or "",
                            )]

                data_row_dict = dict(row_list)
                data_row_dict.update({
                    "RequestInputs": {
                        "RequestUrl": data_row_dict['RequestUrl'],
                        "checkIn": str(row[9]) or "",
                        "nights": row[10] or "",
                        "CompetitorName": row[11] or "",
                        "pos": row[12] or "",
                        "adults": row[13] or "",
                        "children": row[14] or "",
                        "CrawlMode": row[15] or "",
                        "hotelName": row[16] or "",
                        "webSiteHotelId": row[17] or "",
                        "city": row[18] or "",
                        "starRating": row[20] or "",
                        "board": row[21] or "",
                        "room": row[22] or ""
                    },
                })

                DumpDict.append(data_row_dict)

            for newDump in DumpDict:
                a = (newDump['DomainID'])
                l.append(a)
            newdata = set(l)
            lDATA = (list(newdata))
            if len(lDATA) > 1:
                NewMessageDump = HotelCrawling.DomainWiseFetchALL("", DumpDict)
                Multi_SubRequest = []
                for message in NewMessageDump:
                    self.channel.basic_publish(exchange='',
                                               routing_key=str(
                                                   message['GroupName']),
                                               body=str(message))
                    print("Queue Sending Multiple Domain Messages -----------",
                          message)

                    Multi_SubRequest.append(message['subRequestId'])

                MultiStatusUpdate = HotelCrawling.UpdateStatusPushedToQueue(
                    Multi_SubRequest)

                print(
                    "Messages pushed to queue and status updated in MySQL crawl table"
                )

            else:

                Single_SubRequest = []
                SingleFetch = HotelCrawling.SingleDomainEntry("", DumpDict)
                cnt = 0

                for single_message in SingleFetch:
                    self.channel.basic_publish(
                        exchange='',
                        routing_key=str(single_message['GroupName']),
                        body=str(single_message))

                    print("Queue Sending Single Domain Messages ------------",
                          single_message)

                    Single_SubRequest.append(single_message['subRequestId'])
                    cnt += 1

                MultiStatusUpdate = HotelCrawling.UpdateStatusPushedToQueue(
                    Single_SubRequest)

                if MultiStatusUpdate == "Successfully updated Status":

                    time.sleep(10)
                    t2 = HotelCrawling()

        self.connection.close()
        logger.debug("Dynamic Queue Connection closed successfully")
Example No. 24
    def callback(ch, method, properties, body):

        # print("Method Name --------------", method)
        # print("Properties Name ----------", properties)
        logger.debug('Dynamic Scraping Consumer --- Callback function called')
        '''
        :param ch:   connection channel
        :param method:  method name
        :param properties: priority Properties
        :param body:  message
        :return:
        '''

        # print("Receiving Messages ------------ %r" % body)
        # print("Consumer running Time", datetime.datetime.now())

        try:
            data = body.decode('utf-8')
            consume_data = data.replace("'", "\"")
            consume_data = json.loads(
                consume_data)  # convert string to python dict

        except Exception as e:
            # payload is not valid JSON; fall back to evaluating the repr of the dict
            data = body.decode('utf-8')
            consume_data = eval(data)

        print('queue message for SR:%s' % consume_data['subRequestId'])
        '''
        if consume_data['GroupName'] =='Hotelbeds':
            try:
                scexec=ScriptHandler(consume_data)
                data=scexec.execute_crawl()
            except Exception as e:
                print("Script Executer return error", str(e))
        '''

        # from pdb import set_trace; set_trace()
        if "Retail" in consume_data['BusinessType']:
            try:
                scexec = ScriptsExecution(consume_data)
                data = scexec.run()
                print(" Received as a response", data)
            except Exception as e:
                print("Retail Script Executer return error", str(e))
                #logger.error("Script Executer return error", str(e))

        elif "Hotel" in consume_data['BusinessType']:
            error = None
            error_code = ''
            # sub_req_id = consume_data['subRequestId']
            ch.basic_ack(delivery_tag=method.delivery_tag)
            try:
                crawled_data = ScriptHandler(consume_data).execute_crawl(
                    method.redelivered)
            except core_exceptions.ScriptPNF:
                error = True
                error_code = 'script_timeout'
                crawled_data = consume_data.copy()

            crawled_hotel_count = len(crawled_data.get('hotels', list()))
            if not crawled_hotel_count and not error_code == 'script_timeout':
                error = True
                error_code = 'empty_hotels'
            print("\n\n\ncrawled hotels, error")
            print(crawled_hotel_count, bool(error), error_code)

            if error and error_code == 'empty_hotels':
                print('\n\n\nEmpty Hotels')
                core_services.MongoHandler().save_pnf(crawled_data, error)
                pnf_update_query = "UPDATE tbl_HotelCrawlRequestDetail SET StatusId = 8 WHERE HotelCrawlRequestDetailId = %s AND StatusId = 11"
                conn = DynamicCommonConnection.MySQLConnection()
                cur = conn.cursor()
                cur.execute(pnf_update_query % crawled_data['subRequestId'])
                conn.commit()
                cur.close()
                conn.close()
            elif error and error_code == 'script_timeout':
                print('\n\n\nScript Timeout')
                connection2 = pika.BlockingConnection(
                    pika.ConnectionParameters(host='localhost'))
                channel2 = connection2.channel()
                channel2.queue_declare("Reparse")
                channel2.basic_publish(exchange='',
                                       routing_key='Reparse',
                                       body=json.dumps(crawled_data))
                connection2.close()
            else:
                print('\n\n\nGREAT SUCCESS')
                core_services.MongoHandler().save_successful_crawl(
                    crawled_data)
                connection2 = pika.BlockingConnection(
                    pika.ConnectionParameters(host='localhost'))
                channel2 = connection2.channel()
                channel2.queue_declare("Parser")
                channel2.basic_publish(exchange='',
                                       routing_key='Parser',
                                       body=json.dumps(crawled_data))
                connection2.close()
Example No. 25
                    print("Queue Sending Single Domain Messages ------------",
                          single_message)

                    Single_SubRequest.append(single_message['subRequestId'])
                    cnt += 1

                MultiStatusUpdate = HotelCrawling.UpdateStatusPushedToQueue(
                    Single_SubRequest)

                if MultiStatusUpdate == "Successfully updated Status":

                    time.sleep(10)
                    t2 = HotelCrawling()

        self.connection.close()
        logger.debug("Dynamic Queue Connection closed successfully")


if __name__ == '__main__':
    logger.debug("Dynamic Producer Scrapping started ")
    # t1 = DynamicProducer()
    # t1.start()
    print("SCRIPT CALLED --- ")
    while True:
        t1 = HotelCrawling()
        t1.run()
        # t1.setDaemon(True)
        # t1.start()
        time.sleep(10)  # pause between producer runs
Example No. 26
def DBFetchData():
    '''
        Database Selection Query call function
        :return:
    '''

    logger.debug("Priority Parser Producer --- Database Function called")

    client = MongoClient('localhost', 27017)
    mongoDB = client.HTMLDumps

    db = CommonConnection.MySQLConnection()

    group = db.cursor()
    '''
    Status ID - 8 Is "Reparse" status in tbl_RequestRunDetail table
    '''

    group.execute(
        "select RequestRunId from  tbl_RequestRunDetail where FK_StatusId = 10"
    )

    data = group.fetchall()
    requestRunIDList = []
    if data:
        for row in data:
            requestRunIDList.append(row[0])

    newRequestRunId = list(set(requestRunIDList))
    messages = []
    for requestRunID in newRequestRunId:
        records = mongoDB.HTMLRepository.find({
            '$and': [{
                'RequestRunId': {
                    '$eq': str(requestRunID)
                }
            }, {
                'Error': "0"
            }]
        })

        for row in records:
            messages.append(row)
    if messages:
        SYSdate = datetime.datetime.now()
        mongoDB.ParserQueueUpdate.update({'PARSER': '1'}, {
            "$set": {
                'ReParseQueueUpdateDateTime':
                datetime.datetime.strftime(SYSdate, '%Y-%m-%d %H:%M:%S')
            }
        })

    if data:
        for UpdateRunID in data:
            UpdateRequestRunId = UpdateRunID[0]
            '''
            update tbl_RequestRunDetail Status = Push to Queue After Adding Reparse status records into ReParse Queue  
            '''

            group.execute(
                "update tbl_RequestRunDetail set ReParseStatus = 'Running' where RequestRunId = %s",
                (UpdateRequestRunId,))
            db.commit()

    group.close()
    db.close()

    return messages
Example No. 27
    :return:
    '''
    return True


def ProducerMain():

    messages = DBFetchData()
    channel = RabbitConnection()

    if messages:
        for row in messages:
            message = row
            priority = 9
            print(message)
            channel.basic_publish(exchange='',
                                  routing_key='parser',
                                  body=str(message),
                                  properties=pika.BasicProperties(
                                      delivery_mode=2, priority=priority))


logger.debug("Priority Parser Producer called")
main = ProducerMain()

# while True:
#     print("Start:", datetime.datetime.now())
#     main = ProducerMain()
#     sleep(60)
#     print("Sleep Time ")