Example No. 1
def start_read_service(data_directory,
                       data_store_file,
                       device_type,
                       read_frequency=15,
                       server_url=None):
    # step 1: validate parameters
    status, error = FlaskHelper.parameter_check(data_directory,
                                                data_store_file,
                                                device_type, server_url)
    if not status:
        print("BIB read server start failed, failure reason: %s" % error)
        return
    # step 2: start the read loop, polling every read_frequency seconds
    reader = BibReader(data_directory)
    print("Start service reading bib device records!")
    while True:
        new_data = reader.read_new_data()
        print("\nREADING....NEW...DATA")
        print(os.linesep.join(new_data))
        print("Finished reading!\n")
        SystemHelper.append_to_file(data_store_file,
                                    os.linesep.join(new_data))

        # send the new batch to the server back-end, if one is configured
        send_data = {'deviceType': device_type, 'dataBatch': new_data}
        if server_url is not None and len(new_data) > 0:
            HttpHelper.post(server_url, send_data)
        time.sleep(read_frequency)
Example No. 2
def uploadPage(source):
    try:
        total = 0
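        # drain the source collection in batches of 20 until it is empty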
        while True:
            if source == 'page':
                docList = pageCollection.nextPage(20)
            else:
                docList = keyCollection.nextPage(20)
            if docList is None or len(docList) == 0:
                break

            for doc in docList:
                if doc['state'] != 'PARSED':
                    continue

                # search WordPress by title to detect duplicates
                req = {'title': doc['title']}
                errorCode, rsp = HttpHelper.post(QUERY_URL, req)
                if errorCode != 'OK':
                    raise Exception('query error, url=' + doc['url'])
                if rsp['errorCode'] == 'ERROR':
                    doc['state'] = 'DUPED'
                    pageCollection.updateOne(doc)
                    continue

                # upload
                postTitle = doc['pageTitle']
                if postTitle is None:
                    postTitle = doc['title']
                postExcerpt = doc['pageDescription']
                if postExcerpt is None:
                    postExcerpt = doc['description']
                req = {
                    'ID': 0,
                    'author': 1,
                    'title': postTitle,
                    'excerpt': postExcerpt,
                    'content': doc['content'],
                    'categories': [1]
                }
                errorCode, rsp = HttpHelper.post(INSERT_URL, req)
                if errorCode != 'OK':
                    raise Exception('insert error, url=' + doc['url'])

                if rsp['errorCode'] == 'ERROR':
                    doc['state'] = 'POSTERROR'
                else:
                    doc['state'] = 'POSTED'
                pageCollection.updateOne(doc)

                total += 1
                print('total=' + str(total))

                time.sleep(1)

    except Exception as err:
        print(err)
Example No. 3
def scrape(logfile):
    common_words = load_common_words()
    referer = BASE_URL
    http_req = HttpHelper(BASE_URL, MAC_USER_AGENT, HOST, referer)
    print('Running...')
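    # only fetch words not already recorded in the log, so reruns can resume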
    for word in common_words:
        if word not in logfile.items:
            res = http_req.get(create_search_url(word))
            fname = "{0}/{1}.html".format(HTML_TRANSLATION_DOWNLOAD_FILEPATH, word)
            save_file(fname, res.text)
            logfile.append(word)
            time.sleep(2)
    print('Done!')
Example No. 4
def parsePage():
    try:
        total = 0
        while True:
            docList = pageCollection.findPage({'state': 'CREATED'}, 0, 20)
            if docList is None or len(docList) == 0:
                break

            for doc in docList:

                try:
                    total += 1
                    print('total=' + str(total))
                    print('url=' + doc['url'])

                    fileName, finalUrl = HttpHelper.fetchAndSave(
                        doc['url'], ROOT_PATH, 'utf-8', 2)
                    if fileName is None:
                        doc['state'] = 'CLOSED'
                        pageCollection.updateOne(doc)
                        continue

                    filePath = HttpHelper.getFullPath(ROOT_PATH, fileName, 2)
                    html = FileHelper.readContent(filePath)
                    pageTitle, pageDescription, pageContent = ParseHelper.parseWordPressContent(
                        html, True)
                    if pageContent is not None and pageTitle is not None and pageDescription is not None:
                        doc['pageTitle'] = pageTitle
                        doc['pageDescription'] = pageDescription
                        doc['pageContent'] = pageContent
                        doc['state'] = 'PARSED'
                    else:
                        doc['state'] = 'CLOSED'
                    pageCollection.updateOne(doc)

                    time.sleep(1)
                except Exception as err:
                    print(err)
                    doc['state'] = 'CLOSED'
                    pageCollection.updateOne(doc)

    except Exception as err:
        print(err)
Example No. 5
def completeTask(task):
    try:
        req = {'uid': 'NET_0', 'task': task}
        errorCode, rsp = HttpHelper.post(DISPATCHER_URL + "/webapi/complete2",
                                         req)
        if rsp is not None and 'errorCode' in rsp and rsp['errorCode'] == 'OK':
            print("complete task ok, task=" + json.dumps(task))
        else:
            print("complete task error, rsp=" + json.dumps(rsp))
    except Exception as err:
        print(err)
Example No. 6
def downloadTrackingLog(siteName, logDate=None):
    if logDate is None:
        now = datetime.now()
        logDate = now.strftime('%Y%m%d')
    logFileName = '{0}.{1}.log'.format(siteName, logDate)
    url = DOWNLOAD_URL + '/logs/' + logFileName
    statusCode, html, finalUrl = HttpHelper.fetch(url)
    if statusCode == 200 and html is not None and len(html) > 0:
        filePath = ROOT_PATH + '/' + logFileName
        FileHelper.writeContent(filePath, html)
        print('download log file ok, fileName=' + logFileName)
    else:
        print('download log file error, fileName=' + logFileName)
Example No. 7
def uploadKeyPage():
    try:
        total = 0
        while True:
            docList = keyCollection.nextPage(20)
            if docList is None or len(docList) == 0:
                break

            for doc in docList:
                total += 1
                print('total={0}, title={1}'.format(total, doc['title']))

                if doc['state'] != 'GENERATED':
                    print("invalid state, skip")
                    continue

                # upload
                postTitle = doc['finalTitle']
                postExcerpt = doc['finalDescription']
                postContent = doc['finalContent']
                if postTitle is None or postExcerpt is None or postContent is None:
                    print('invalid post, key=' + doc['title'])
                    doc['state'] = 'GENERATE_ERROR'
                    keyCollection.updateOne(doc)
                    continue

                req = {
                    'ID': 0,
                    'author': 1,
                    'title': postTitle,
                    'excerpt': postExcerpt,
                    'content': postContent,
                    'categories': [1]
                }
                errorCode, rsp = HttpHelper.post(INSERT_URL, req)
                if errorCode != 'OK':
                    raise Exception('insert error, url=' + doc['url'])

                if rsp['errorCode'] == 'OK':
                    doc['postID'] = rsp['ID']
                    doc['state'] = 'UPLOADED'
                    print('upload OK')
                else:
                    doc['state'] = 'UPLOAD_ERROR'
                    print('upload ERROR')
                keyCollection.updateOne(doc)

                time.sleep(1)

    except Exception as err:
        print(err)
Example No. 8
def fetchTask(task):
    try:
        url = task['url']
        statusCode, html = HttpHelper.fetch(url)
        if html is not None and len(html) > 0:
            print("fetch task ok")
        else:
            print("fetch task error")

        # parse and update
        parseAndImport(url, html)

    except Exception as err:
        print(err)
Example No. 9
def eliminatePost():
    try:
        # Filter tracking, get top 100 post page
        top100List = trackCollection.findPage({}, 0, 100, 'count', 'desc')
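        # (the membership test later assumes these entries are permalink
        # strings; if findPage returns documents, nothing will ever match)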

        # Get all post id and permalinks
        statusCode, html, finalUrl = HttpHelper.fetch(PERMALINKS_URL)
        if statusCode != 200 or html is None:
            print(
                'get all post id and permalinks fails, statusCode={0}'.format(
                    statusCode))
            return
        lines = html.split('\n')
        print('get all post id and links ok, len={0}'.format(len(lines)))
        eliminateList = []
        for line in lines:
            kv = line.split(';')
            if len(kv) == 2:
                postId = kv[0]
                postLink = kv[1]
                if postLink not in top100List:
                    eliminateList.append(postId)

        # Remove all posts not in the top 100
        print('eliminate len={0}'.format(len(eliminateList)))
        for postId in eliminateList:
            req = {
                'ID': postId,
            }
            errorCode, rsp = HttpHelper.post(DELETE_URL, req)
            if errorCode != 'OK' or rsp['errorCode'] != 'OK':
                raise Exception('delete error, id={0}'.format(postId))
            else:
                print('eliminate post ok, id={0}'.format(postId))

    except Exception as err:
        print(err)
Example No. 10
def getTask():
    try:
        req = {'uid': 'NET_0', 'whiteList': ['default.robot']}
        errorCode, rsp = HttpHelper.post(DISPATCHER_URL + "/webapi/task2", req)
        # the truthy check on taskList also guards against an empty list
        if rsp is not None and rsp.get('errorCode') == 'OK' and rsp.get('taskList'):
            task = rsp['taskList'][0]
            print("get task ok, task=" + json.dumps(task))
            return ['OK', task]
        else:
            print("get task error, rsp=" + json.dumps(rsp))
            # rsp may be None or missing 'errorCode' on failure
            return [rsp.get('errorCode', 'UNKNOWN') if rsp else 'UNKNOWN', None]

    except Exception as err:
        print(err)
        return ['UNKNOWN', None]
Example No. 11
def sendPage(task, html):
    dataCenter = task['dataCenter']
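    # only forward pages that carry more than 1 KB of HTML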
    if html is not None and len(html) > 1024:
        req = {
            'task': task,
            'redirectUrl': task['url'],
            'page': {
                'content': None,
                'encoding': 'UTF8',
                'html': html,
            },
        }
        errorCode, rsp = HttpHelper.post(dataCenter + "webapi/page2", req)
        if rsp is not None and 'errorCode' in rsp and rsp['errorCode'] == 'OK':
            print("sendPage OK: url: {0}".format(dataCenter + "webapi/page2"))
            return True
        else:
            print("sendPage ERROR: url: {0}".format(dataCenter +
                                                    "webapi/page2"))
            return False
    # html missing or too small to send
    return False
Example No. 12
def testSoup():
    url = "https://www.drugs.com/comments/abobotulinumtoxina/"
    # NOTE: fetch is assumed here to return the HTML body directly
    html = HttpHelper.fetch(url)

    # an explicit parser avoids BeautifulSoup's "no parser specified" warning
    soup = BeautifulSoup(html, 'html.parser')

    # remove all tables inside the user-comment divs
    tableList = soup.select("div.user-comment table")
    for table in tableList:
        table.decompose()

    # collect the outer HTML of each user-comment div
    divList = soup.select("div.user-comment")
    reviewDivList = []
    for div in divList:
        divHtml = str(div)
        print(divHtml)
        reviewDivList.append(divHtml)
Example No. 13
def parseAndImport(url, html):
    md5 = CryptHelper.getMD5Hash(url)
    if html is not None and len(html) > 0:
        found, doc = DocHelper.parseDoc(html)
        article, errorCode = DocHelper.doc2Article(doc, url)
        if errorCode != "OK" or article is None:
            # fall back to a stub record; the original read article['status']
            # unconditionally here, which raises when article is None
            status = article['status'] if article is not None else errorCode
            article = {
                'md5': md5,
                'status': status,
            }
    else:
        if len(url) >= 1000:
            url = url[0:1000]
        # no HTML at all; the original referenced an undefined article here,
        # so 'FETCH_ERROR' is an assumed placeholder status
        article = {
            'md5': md5,
            'status': 'FETCH_ERROR',
        }

    articleList = []
    # .get() keeps the stub records above from raising KeyError on the
    # fields only a fully parsed article carries
    articleDoc = {
        'id': article['md5'],
        'title': article.get('title'),
        'excerpt': article.get('excerpt'),
        'content': article.get('content'),
        'author': article.get('author'),
        'domain': article.get('domain'),
        'categories': article.get('categories'),
        'tags': article.get('tags'),
        'url': article.get('url'),
        'status': article['status'],
        'key': article.get('key'),
    }
    articleList.append(articleDoc)
    errorCode, rsp = HttpHelper.post(IMPORT_URL, articleList)
    if errorCode == "OK" and rsp is not None and rsp.get('isOk') is True:
        print("import article ok, id=" + article['md5'])
    else:
        print("import article error, id=" + article['md5'])
Example No. 14
def searchPageByKey():
    try:
        total = 0
        while True:
            docList = keyCollection.findPage({'state': 'CREATED'}, 0, 20)
            if docList is None or len(docList) == 0:
                break

            for doc in docList:

                total += 1
                print('total=' + str(total))

                pageList = []
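                # pull up to three pages of search results (offsets 0, 20, 40)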
                for offset in [0, 20, 40]:
                    url = SEARCH_PAGE_PATTERN.format(doc['title'], offset, 20)
                    errorCode, response = HttpHelper.get(url)
                    if errorCode != 'OK' or response is None or 'result' not in response:
                        break

                    if 'webPages' not in response['result']:
                        break

                    webPages = response['result']['webPages']
                    if 'value' not in webPages:
                        break

                    value = webPages['value']
                    for item in value:
                        if 'name' in item and 'url' in item and 'snippet' in item:
                            page = {
                                'title': item['name'],
                                'url': item['url'],
                                'description': item['snippet'],
                                'state': 'CREATED',
                                'key': doc['title']
                            }

                            isBlack = any(site in item['url']
                                          for site in BLACK_SITE_LIST)

                            if not isBlack:
                                pageList.append(page)

                if len(pageList) > 0:
                    doc['pageList'] = pageList
                    doc['state'] = 'PAGED'
                    print('search page by key, key={0}, found={1}'.format(
                        doc['title'], len(pageList)))
                else:
                    doc['state'] = 'CLOSED'
                    print('search page by key, key={0}, closed'.format(
                        doc['title']))
                keyCollection.updateOne(doc)

                if len(pageList) > 0:
                    pageCollection.insertMany(pageList)

                time.sleep(1)

    except Exception as err:
        print(err)
Example No. 15
def searchKeyBySeed():
    try:
        total = 0
        while True:
            docList = seedCollection.findPage({'state': 'CREATED'}, 0, 20)
            if docList is None or len(docList) == 0:
                break

            for doc in docList:
                try:
                    print('seed={0}'.format(doc['title']))
                    url = SEARCH_KEY_PATTERN.format(doc['title'], 0, 10)
                    errorCode, response = HttpHelper.get(url)
                    if errorCode != 'OK' or response is None:
                        continue

                    if 'result' not in response:
                        raise Exception('result not found')

                    result = response['result']
                    if ('relatedSearches' not in result) \
                            or ('webPages' not in result):
                        raise Exception(
                            'relatedSearches or webPages not found')

                    relatedSearches = result['relatedSearches']
                    if 'value' not in relatedSearches:
                        raise Exception('value not found')

                    webPages = result['webPages']
                    if 'totalEstimatedMatches' not in webPages:
                        raise Exception('totalEstimatedMatches not found')

                    value = relatedSearches['value']
                    newKeyList = []
                    for item in value:
                        if 'text' in item:
                            newKeyList.append(item['text'])

                    for key in newKeyList:
                        if keyCollection.findOneByFilter({'title': key}) is None:
                            keyCollection.insertOne({
                                'title': key,
                                'state': 'CREATED',
                                'level': doc['level'] + 1,
                                'parent': doc['title'],
                                'matched': webPages['totalEstimatedMatches']
                            })

                    doc['state'] = 'KEYED'
                    seedCollection.updateOne(doc)

                    total += 1
                    print('total={0}, seed={1}'.format(total, doc['title']))

                    time.sleep(1)

                except Exception as err:
                    doc['state'] = 'CLOSED'
                    seedCollection.updateOne(doc)
                    print(err)

    except Exception as err:
        print(err)
Example No. 16
from dn_helper import DNHelper
from http_helper import HttpHelper
import requests
import json
import time
import codecs
start = 320776
history_reader = open("history_data.txt", "r")
history_writer = codecs.open("history_data.txt", "a", 'utf8')
# materialize the filter result so it can be iterated twice below
# (on Python 3, filter returns a one-shot iterator)
history_datas = list(
    filter(lambda x: x,
           map(lambda x: x.strip('\n').split('\t'),
               history_reader.readlines())))
history_titles = [x[0] for x in history_datas]
history_links = [x[1] for x in history_datas]
dn_helper = DNHelper()
http_helper = HttpHelper()
notice_template = u"新公告:%s\n 链接:%s"

debug = False
first_run = True
if not debug and not first_run:
    group_ids = [655514756, 584050850]
else:
    group_ids = [655514756]


def calc_hammer_distance(s1, s2):
    dp = [[0x3f3f3f3f] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    dp[0][0] = 0
    for i in range(len(s1) + 1):
        for j in range(len(s2) + 1):
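            # (the listing truncates the function here; the fill below is a
            # standard Levenshtein edit-distance DP, which the table shape
            # and the 0x3f3f3f3f "infinity" initialisation suggest)
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            else:
                dp[i][j] = min(
                    dp[i - 1][j] + 1,  # deletion
                    dp[i][j - 1] + 1,  # insertion
                    dp[i - 1][j - 1] +
                    (0 if s1[i - 1] == s2[j - 1] else 1))  # substitution
    return dp[len(s1)][len(s2)]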
Example No. 17
#coding: utf8
import json

from wsgiref.simple_server import make_server
from dn_data_extractor import dn_data_extract
from http_helper import HttpHelper
hh = HttpHelper()


def process_dn_data_message(msg, text):
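    # route the extractor result: falsy means failure, a list is joined and
    # sent as text, a 'text:' prefix is sent as text, anything else is
    # treated as an image path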
    image_path = dn_data_extract(msg["group_id"], text)
    if not image_path:
        print('get image path failed msg = %s' % text)
    elif isinstance(image_path, list):
        hh.send_group_msg(msg['group_id'], '\n'.join(image_path))
    elif image_path.startswith('text:'):
        hh.send_group_msg(msg['group_id'], image_path[5:])
    else:
        hh.send_group_image(msg['group_id'], image_path)
    return 'success'


def application(environ, start_response):
    print('get')
    start_response('200 OK', [('Content-Type', 'text/html')])
    # CONTENT_LENGTH may be missing or empty, so fall back to 0
    request_body_size = int(environ.get('CONTENT_LENGTH') or 0)
    msg = json.loads(environ['wsgi.input'].read(request_body_size))
    if "message" not in msg:
        return "success"
    text = msg['message']
    if text.startswith('.dn') or text.startswith('.test') or text.startswith(