Example #1
def main(argv):
    # get all files
    files = sql.fetch_all(None, None, "SELECT * FROM `raws`")

    for f in files:
        print("Processing %s" % f['filename'])
        raw_id = f['id']

        # preprocess
        lines = []
        try:
            line_nr = 0
            for line in f['data'].split("\r\n"):
                if len(line) == 0: continue
                line_nr += 1
                line = line.rstrip('\r\n')
                line = re.split(r'\t|;', line)
                line = preprocess_line(line)
                lines.append(line)
        except:
            print "%d: %s" % (line_nr, line)
            raise

        # fix!
        line_nr = 0
        try:
            for line in lines:
                line_nr += 1
                phenotype = format_line(line) # create a readable program

                # look up the phenotype id based on the line number
                try:
                    if (phenotype['entity_id'] == 808 and phenotype['value_id'] == 178): continue
                    phenotype_id = sql.fetch_all('phenotype_raws', { 'line_nr': line_nr, 'raw_id': raw_id })[0]['phenotype_id']
                except:
                    print "%d: %d" % (line_nr, raw_id)
                    raise

                # get the linked phenotype_plants / phenotype_samples / phenotype_aliquots id, if any
                ph_plant = sql.fetch('phenotype_plants', phenotype_id, 'phenotype_id')
                ph_sample = sql.fetch('phenotype_samples', phenotype_id, 'phenotype_id')
                ph_aliquot = sql.fetch('phenotype_aliquots', phenotype_id, 'phenotype_id')

                # check where the link should belong and remove the others, if any
                if ora_sql.is_plant(phenotype['sample_id']) or ora_sql.was_plant(phenotype['sample_id']):
                    if ph_sample != False: print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample['id']
                    if ph_aliquot != False: print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot['id']
                elif ora_sql.is_sample(phenotype['sample_id']):
                    if ph_aliquot != False: print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot['id']
                    if ph_plant != False: print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant['id']
                elif ora_sql.is_aliquot(phenotype['sample_id']):
                    if ph_sample != False: print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample['id']
                    if ph_plant != False: print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant['id']
                else:
                    print "%s NOT found!!" % phenotype['sample_id']

        except:
            progress("%d: %s" % (line_nr, line))
            raise
Example #2
def countPending():
  results = fetch('select count(*) from unsubs')
  print results
  results = fetch('select count(hash) from unsubs')
  print results
  results = fetch('select count(distinct unsubhash) as b from anonymousanalytics group by emailhash order by b desc')
  #print results
  results = fetch('select count(distinct emailhash ) from anonymousanalytics ')
  print results
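
These snippets call module-level `fetch` and `commit` helpers that are not shown. A minimal sketch of what such helpers might look like, assuming a MySQLdb-style DB-API connection; the connection settings and helper internals below are illustrative, not taken from the original code:

import MySQLdb

# Hypothetical connection; host/user/password/database are placeholders.
_conn = MySQLdb.connect(host='localhost', user='app', passwd='secret', db='appdb')

def _as_params(params):
    # The examples pass either nothing, a single value, or a sequence.
    if params is None or isinstance(params, (tuple, list)):
        return params
    return (params,)

def fetch(query, params=None):
    # Run a SELECT and return all rows.
    cur = _conn.cursor()
    cur.execute(query, _as_params(params))
    rows = cur.fetchall()
    cur.close()
    return rows

def commit(query, params=None):
    # Run an INSERT/UPDATE/DELETE and commit immediately.
    cur = _conn.cursor()
    cur.execute(query, _as_params(params))
    _conn.commit()
    cur.close()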
Example #3
def deleteReadEmail17days():
  results = fetch('select email from readmail')
  print results[-10:]
  
  start = 6000
  for r in results:
    commit('delete from readmail where email=%s',str(start))
    start += 1
  results = fetch('select email from readmail')
  print results[-10:]
Example #4
def deleteLastReads():
  results = fetch('select email from readmail')
  print results[-10:]
  
  total = len(results)
  
  for r in results[total-150:]:
    commit('delete from readmail where email=%s',r[0])
  results = fetch('select email from readmail')
  print results[-10:]
  
#deleteLastReads()
#deleteReadEmail17days()
Example #5
def printAnalytics():
    log.tid = newHash()
    results = fetch('select * from analytics')
    log.info('all analytics', results)
    results = fetch('select count(*) from unsubs')
    log.info('current unsubs', results)
    log.info('print analytics total, successful, all broken')
    results = fetch('select count(*) from analytics')
    log.info('total', results)
    results = fetch('select count(*) from analytics where success=1')
    log.info('successful', results)
    results = fetch('select email, url from analytics where success=0')
    log.info(results)
    log.info('success / not success for william.k.dvorak')
    log.info(getAnalyticsForEmail('*****@*****.**'))
Example #6
def getFive():
    # Fetch in random order so that, if two slaves are running, they are unlikely
    # to grab the same unsub under high volume (note: LIMIT 1 returns a single row)
    results = fetch(
        'select url, email, hash from unsubs order by RAND() limit 1')
    s = set()
    for r in results:
        s.add(str(r[2]))
    origSet = set(s)
    if not s:
        return [], origSet
    s = str(list(s)).replace('[', '(').replace(']', ')')
    results = fetch('select url, email, hash from unsubs where hash in ' + s)
    l = list()
    for r in results:
        l.append(UnSub(r[0], r[1], r[2]))
    return l, origSet
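
The IN clause above is built by string-replacing Python's list repr, which works here because the hashes are plain strings. An alternative sketch for the hash set `s` (before it is turned into a string), assuming the `fetch` helper forwards a parameter sequence to the DB-API cursor, would build one placeholder per hash instead of splicing values into the SQL:

# Sketch: parameterized IN clause, one %s placeholder per hash.
hashes = list(s)
placeholders = ', '.join(['%s'] * len(hashes))
results = fetch(
    'select url, email, hash from unsubs where hash in (%s)' % placeholders,
    hashes)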
Example #7
def deleteAllUnsubs():
    results = fetch('select hash from unsubs')
    log.info('deleting all unsubs with # unsubs ' + str(len(results)))
    if len(results) < 15:
        for r in results:
            hh = r[0]
            commit('delete from unsubs where hash=%s', hh)
Example #8
def getAnalyticsForEmail(email):
    digest = hashEmail(email)
    results = fetch(
        'select count(*) from anonymousanalytics where emailhash=%s', digest)
    total = results[0][0]
    results = fetch(
        'select count(*) from anonymousanalytics where emailhash=%s and success=1',
        digest)
    successful = results[0][0]
    if email == 'admin':
        results = fetch('select count(*) from anonymousanalytics')
        total = results[0][0]
        results = fetch(
            'select count(*) from anonymousanalytics where success=1')
        successful = results[0][0]
    if email == 'admin24':
        # cutoff timestamp from 24 hours ago, used as the lower bound on `stamp`
        now = str(datetime.datetime.now() - timedelta(hours=24))
        results = fetch(
            'select count(*) from anonymousanalytics where stamp > %s', now)
        total = results[0][0]
        results = fetch(
            'select count(*) from anonymousanalytics where success=1 and stamp > %s',
            now)
        successful = results[0][0]
    return [str(int(successful)), str(int(total) - int(successful))]
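
The function returns a two-element list of strings, [successes, failures]. A hypothetical caller (the 'admin' argument is the special case handled above) might unpack it like this:

ok, failed = getAnalyticsForEmail('admin')
print 'successful:', ok, 'unsuccessful:', failed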
Example #9
def main(argv):

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('files', nargs='+')
    parser.add_argument('--standortid', type=int)
    parser.add_argument('--pages', type=int, default=1)
    args = parser.parse_args(argv)
    
    for full_fn in args.files:
        fn = ntpath.basename(full_fn)
        # look up the file id
        file_id  = sql.fetch('ufiles', fn, id_key='name')
        if not file_id:
            print "File '%s' not found in DB, skipping" % fn
        else:
            data = []
            headers = []
            for page in xrange(args.pages):
                data, headers = p_xls.read_xls_data(full_fn, page)
                lines = 0  # administration: number of successfully inserted lines
                for row in data:
                    standortid = -1
                    if hasattr(row, 'StandortID'):
                        standortid = getattr(row, 'StandortID')
                    elif args.standortid is not None:
                        standortid = args.standortid
                    else:
                        sys.stderr.write('No StandortID found!\n')
                        exit()

                    rs = sql.fetch_all('temps', {
                        'datum': getattr(row, 'Datum'),
                        'location_id': standortid
                    })
                    if rs != False:  # and len(rs) == 1:
                        for i in xrange(len(rs)):
                            if (sql.insert('ufiletemps', {
                                'ufile_id': file_id['id'],
                                'temp_id':  rs[i]['id']
                            })):
                                lines += 1
                            else:
                                print "%d,%d" % (file_id['id'], rs[i]['id'])
                print "Inserted %d/%d of page %d" % (lines, len(data), page)
            sql.commit() # after each file

    return None
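
A hypothetical command-line invocation of this importer (the script name, file name, StandortID value, and page count below are made up; the --standortid and --pages flags come from the argparse setup above):

if __name__ == '__main__':
    # e.g. invoked as: python <script>.py data_2021.xls --standortid 7 --pages 2
    main(sys.argv[1:])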
Example #10
def allUnsuccessful():
  results = fetch('select email, url from analytics where success=0')
  ss = dict()
  for r in results:
    ss[str(r[1])] = str(r[0])
  i = 0
  # open a window of ~10 of the failing URLs (items 201-210) in the browser
  low = 200
  high = low+10
  print len(ss.keys())
  for k,v in ss.iteritems():
    if i > low:
      os.system('open '+k)
      print v
      print k
    i+=1
    if i > high:
      break
Example #11
def anonymousAnalytics(email, unsubhash, success=False):
    digest = hashEmail(email)

    now = str(datetime.datetime.now())
    results = fetch(
        'select unsubhash, success from anonymousanalytics where unsubhash=%s',
        (unsubhash))
    success = int(success)
    if results:
        if int(results[0][1]) == 0 and success:
            commit(
                'update anonymousanalytics set success=1 where unsubhash=%s',
                (unsubhash))
        else:
            log.info('unsub hash is still failing, do not update analytics',
                     unsubhash)
    else:
        commit(
            'insert into anonymousanalytics (emailhash, unsubhash, success, stamp) values (%s, %s, %s, %s)',
            (digest, unsubhash, str(success), now))
Example #12
def handle(request):
    info = []

    #
    #		NEW JOB REQUESTED
    #
    if request[0] == 'n':
        print "Got a request for a new job"
        data = sql.fetch("SELECT * from openjobs WHERE jobtype > -1 AND tid = (SELECT max(tid));")[0]

        # Build the query to move the job to the pending table
        pending_q = ("INSERT INTO pendingjobs(tid, jobtype, jobdesc, jobdata) VALUES (%s, %s, '%s', '%s');" %
                     (data[0], data[1], data[2], data[3]))

        # Build info with delimiters for transit
        for el in data:
            el = str(el)
            info.append(el)
            info.append("^")
        info.pop(-1)  # Remove the last item, as it's a spare delimiter

        # Move the job to the pending table
        print "> Moving job with table ID [" + str(info[0]) + "] to the pending jobs table"
        res = sql.update(pending_q)
        if res == -1:
            print ">> There was an error moving job to the pending table! Changes reverted"
        elif res == 1:
            print ">> Job moved to pending table"

        # Remove the job from the open table
        print "> Removing job with table ID [" + str(info[0]) + "] from open jobs"
        res1 = sql.update("delete from openjobs where tid = %s;" % str(info[0]))
        if res1 == -1:
            print ">> There was an error removing the job from the open table! Changes reverted"
        elif res1 == 1:
            print ">> Job removed from open table"

    #
    #       UPDATE TO JOB REQUESTED
    #
    if request[0] == 'u':
        # EXPECTING  :  u, tid, type, desc, data
        # If sorting : data = item,item,item;ORIGINAL_DESC;ORIGINAL_DATA
        # Insert data into closed
        print "Got a request to update job"
        print "> Moving job with table ID [" + str(request[1]) + "] to the closed table"
        sql.update("INSERT INTO closedjobs(tid, jobtype, jobdesc, jobdata) VALUES (%s, %s, '%s', '%s');" %
                   (request[1], request[2], request[3], request[4]))

        # Remove the item from pending
        print "> Removing job with table ID [" + str(request[1]) + "] from the pending jobs"
        res1 = sql.update("delete from pendingjobs where tid = %s;" % str(request[1]))
        if res1 == -1:
            print ">> There was an error removing the job from the pending table! Changes reverted"
        elif res1 == 1:
            print ">> Job removed from pending table"

        # Thank the client
        info.append("skynet thanks you for your service")

    #
    #		REPORT REQUESTED
    #
    if request[0] == 'r':
        info.append("REQ FOR REPORT")
        print "Got a request for a report"

    #
    #		JOIN AND SEND DATA
    #
    reply = ''.join(info).replace('\t', ' ')
    reply += '\t'
    return reply
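
A hypothetical round trip (the sql module and the job rows it touches are assumed to exist; the values are made up) shows the request shapes the handler expects and the tab-terminated reply it returns:

# New-job request: only request[0] is inspected; the reply is the job's fields
# joined with '^' and terminated with a tab.
reply = handle(['n'])

# Update request: u, tid, jobtype, jobdesc, jobdata
# (when sorting, jobdata = item,item,item;ORIGINAL_DESC;ORIGINAL_DATA)
reply = handle(['u', 42, 1, 'sort items', 'a,b,c;OLD_DESC;OLD_DATA'])
print repr(reply)   # 'skynet thanks you for your service\t'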
Example #13
def percentSuccess():
  results = fetch('select success, count(*) from anonymousanalytics group by success')
  print 'success ', results
  dt = datetime.datetime.now() - timedelta(days=20)
  # pass only the date portion of the timestamp (str(dt)[:11] is 'YYYY-MM-DD ')
  results = fetch('select success, count(*) from anonymousanalytics  where stamp > %s group by success',str(dt)[:11])
  print 'success within 20 days', results
Example #14
def index():

    # Bootstrap alerts and errors that will popup on the top of the screen
    alerts = []
    errors = []

    # Arguments being passed through to the html page
    htmlArguments = {}

    # Checks if an error was passed in the query-string arguments (the text after '?' in the URL) and adds it to the Bootstrap errors
    if 'error' in request.args:
        errors.append({
            'error': request.args['error'],
            'error_description': request.args['error_description']
        })

    # Checks if the user isn't logged in locally
    if not session.get("user"):
        # Creates a state for the session for the user
        session["state"] = str(uuid.uuid4())

        # Creates the OAuth2 redirect URL for the user to be logged into, which is passed through into the html arguments
        auth_url = _build_auth_url(scopes=app_config.SCOPE,
                                   state=session["state"])
        htmlArguments['auth_url'] = auth_url
    else:
        # Gets email of the user, and looks up user in the database
        emailOfUser = session["user"]["preferred_username"]
        databaseInfo = sql.fetch(emailOfUser).fetchone()

        # if user is not found in database or invalid refresh token
        if not databaseInfo or not databaseInfo[0]:
            # logs out user
            return redirect(url_for("logout"))

        # Checks if user requires SMS verification, by searching if user has phone number saved, but not verified
        requireSMSVerification = databaseInfo[1] and not databaseInfo[5]
        # Checks if the user wishes to receive Microsoft Teams notifications
        getTeamsNotifications = databaseInfo[2]
        # Checks if the user wishes to send and receive emails over SMS
        emailOverSMS = databaseInfo[4]

        # Prefills phone number on HTML form if phone number is already in the database
        if databaseInfo[1]:
            htmlArguments['prefilledPhoneNumber'] = databaseInfo[1]
        else:
            htmlArguments['prefilledPhoneNumber'] = ""

        # Checks if the user has made a POST request
        if request.method == 'POST':
            # Checks if the user pressed the update button
            if 'updateButton' in request.form:
                # Gets the phone number from the form
                phoneNumber = request.form['phoneNumber']

                # Gets the verification code from the form if required
                if requireSMSVerification:
                    verificationCodeFromUser = request.form[
                        'smsVerificationCode']

                    # Checks if user attempted entering a verification code
                    if verificationCodeFromUser:
                        # Clears the verification code and marks the phone as verified if the code is correct
                        if verificationCodeFromUser == databaseInfo[6]:
                            sql.updateVal(emailOfUser, 'VerifiedPhone', True)
                            sql.updateVal(emailOfUser, 'VerificationCode',
                                          None)

                            requireSMSVerification = False

                            send(
                                "OfficeConnected: You have successfully connected your phone! Reply with 'CMD' to get a full list of commands you can do with OfficeConnected",
                                databaseInfo[1])
                        else:
                            # Tells user that verification code is wrong through Bootstrap
                            errors.append({
                                "error":
                                "Invalid SMS verification code",
                                "error_description":
                                "You have entered an invalid verification code, make sure you've typed the right characters. If you would like a new verification code, you can reply 'LINK' to the SMS message"
                            })

                # Checks if the user is trying to change the phone number to one different from the one in the database
                if databaseInfo[1] != phoneNumber and phoneNumber:
                    # Checks if updated phone number already exists in the database and tells user error through Bootstrap
                    if sql.fetchPhone(phoneNumber).fetchone():
                        errors.append({
                            "error":
                            "Phone number already exists",
                            "error_description":
                            "An account with that phone number already exists in our database, please enter a valid phone number, or to unlink that number, text 'UNLINK' to +1 (844)-961-2701"
                        })
                    else:
                        # Updates unverified phone number in database
                        sql.updateVal(emailOfUser, 'PhoneNumber', phoneNumber)
                        sql.updateVal(emailOfUser, 'VerifiedPhone', False)
                        sql.updateVal(emailOfUser, 'VerificationCode', None)

                        # Replace html argument to updated phone number
                        htmlArguments['prefilledPhoneNumber'] = phoneNumber

                        requireSMSVerification = True

                        # Notifying user over text and Bootstrap alert to verify phone number
                        send(
                            "OfficeConnected: Verify your phone by responding with the message 'LINK' to receive your verification code",
                            phoneNumber)
                        alerts.append(
                            "A message has been sent to your phone. Please verify your phone by responding with the message 'LINK' and entering your verification code"
                        )

                # Updates if the user wants to get Teams notifications based on if the getTeamsNotification checkbox is checked in HTML
                if 'getTeamsNotifications' in request.form and request.form[
                        'getTeamsNotifications'] == 'on':
                    getTeamsNotifications = True
                    sql.updateVal(emailOfUser, 'GetSMSTeamNotifications', True)
                else:
                    getTeamsNotifications = False
                    sql.updateVal(emailOfUser, 'GetSMSTeamNotifications',
                                  False)

                # Updates if the user wants to allow email over SMS based on if the emailOverSMS checkbox is checked in HTML
                if 'emailOverSMS' in request.form and request.form[
                        'emailOverSMS'] == 'on':
                    emailOverSMS = True
                    sql.updateVal(emailOfUser, 'EmailOverSMS', True)
                else:
                    emailOverSMS = False
                    sql.updateVal(emailOfUser, 'EmailOverSMS', False)

            # Checks if the deleteAccount button has been pressed, and clears user from database
            elif 'deleteAccount' in request.form:
                sql.delete(emailOfUser)
                return redirect(url_for("logout"))

        # sets respective HTML arguments to their variables on Python to be passed through in Flask
        htmlArguments['getTeamsNotificationsBool'] = getTeamsNotifications
        htmlArguments['emailOverSMSBool'] = emailOverSMS
        htmlArguments['requireSMSVerification'] = requireSMSVerification

        # Passes through basic user info to Flask
        htmlArguments['user'] = session['user']

    # Passes through Bootstrap alerts and errors to HTML
    htmlArguments['errors'] = errors
    htmlArguments['alerts'] = alerts

    # Renders the HTML, with htmlArguments as its arguments
    return render_template('home.html', **htmlArguments)
Example #15
def numUnsubs():
    results = fetch('select hash from unsubs')
    num = len(results)
    return num
Example #16
def getProblem():
  hashh = 'lrnbgg19'
  results = fetch('select * from unsubs where hash=%s', hashh)
  print results
  results = fetch('select count(*) from unsubs ')
  print results
Example #17
def main(level='para', process_mode=''):
    tokenize = lambda doc: doc.lower().split(" ")
    if level == 'para':
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            item = process_raw_txt(item, mode=process_mode)
            item = remove_puncs(item)
            docs.append(item)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)
    else:
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            sents = sent_tokenize(item)
            for sent in sents:
                sent = process_raw_txt(sent, mode=process_mode)
                sent = remove_puncs(sent)
                docs.append(sent)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)

    f = open(home + 'datasets/{}_tip.pos'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    pos_ids = [line.replace('\n', '').split('\t')[0] for line in texts]

    f = open(home + 'datasets/{}_tip.neg'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    neg_ids = [line.replace('\n', '').split('\t')[0] for line in texts]

    f = open(home + 'datasets/{}_tip.ds'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    texts = [line.replace('\n', '').split('\t') for line in texts]

    id2text = {idx: [remove_puncs(txt)] for idx, txt in texts}

    matrix = []
    for idx, txt in enumerate(texts):
        if level == 'para':
            sql = "SELECT * FROM paragraphs where `ID`=%s"
        elif level == 'sent':
            sql = "SELECT * FROM sentences where `ID`=%s"
        try:
            item = fetch(sql, txt[0])[0]
        except Exception as e:
            print(txt)

            print(fetch(sql, txt[0]))
            raise e
        parent_id = item['parent_id']
        temp = []
        temp.append(int(txt[0]))
        temp.append(item['score'])  #answer score
        if item['LastEditDate'] == '':
            adate = item['CreationDate']
        else:
            adate = item['LastEditDate']
        sql = "SELECT * FROM threads where `id` = %s"
        q = fetch(sql, item['parent_id'])
        if q[0]['LastEditDate'] == '':
            qdate = q[0]['CreationDate']
        else:
            qdate = q[0]['LastEditDate']
        adate = datetime.strptime(adate.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        qdate = datetime.strptime(qdate.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        diff = adate - qdate
        adiff = diff.total_seconds() / 3600.0
        temp.append(adiff)  #answer time difference to question
        now = datetime.strptime(datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
                                "%Y-%m-%dT%H:%M:%S")
        aage = (now - adate).total_seconds() / 3600.0
        temp.append(aage)  #answer age
        temp.append(q[0]['score'])  #question score
        temp.append(q[0]['FavoriteCount'])  #question favorites
        sql = "SELECT * FROM users where `ID` = %s"
        r = fetch(sql, item['OwnerUserId'])
        if len(r) > 0:
            temp.append(r[0]['Reputation'])  #question use reputations
        else:
            temp.append(0)
        temp.append(q[0]['ViewCount'])  #question views
        temp.append((now - qdate).total_seconds() / 3600.0)  #question age
        tokens = id2text[txt[0]][0].split()
        temp.append(len(tokens))  #number of tokens in sentence
        sql = "SELECT * FROM answer_body where `ID` = %s"
        body = fetch(sql, item['answer_id'])[0]['body']
        cnt = 0
        try:
            m = re.findall(
                r'(?:<blockquote>(?:(?!\n<p>|\n<ul>|\n<a href|\n<ol>|\n<h1>|\n<h2>).)*</blockquote>)',
                body, re.DOTALL)
            for s in m:
                body = body.replace(s, '')
            # `c` is a regex fragment assumed to be defined elsewhere in the module
            lis = re.findall(
                r'(?:<pre%s*><code>%s*</code></pre>)|(?:<p>%s*</p>)|(?:<ol>%s*</ol>)|(?:<ul>%s*</ul>)'
                % (c, c, c, c, c), body, re.DOTALL)
            l = len(lis)
            i = 0
            paras = []
            while i < l:
                if not lis[i].startswith('<pre'):
                    code = []
                    trigger = False
                    for j in range(i + 1, l):
                        trigger = True
                        if lis[j].startswith('<pre'):
                            code.append(lis[j])
                        else:
                            break
                    m = '\n$$$$$\n'.join(code)
                    paras.append([lis[i], m])
                    if trigger:
                        i = j
                    else:
                        i = i + 1
                else:
                    i = i + 1

            for unit in paras:
                para = unit[0]
                para = para.replace('\n', '')
                para = para.replace('\r', '')
                para = para.replace('\r\n', '')
                para = para.strip()
                para = re.sub('(<p>|</p>)', '', para)
                para = re.sub('<[^>]*>', ' ', para)
                cnt = cnt + len(para.split())
        except Exception as e:
            raise e
        temp.append(cnt)  #answer size
        question = []
        sql = "SELECT * FROM threads WHERE `id` =%s"
        res = fetch(sql, parent_id)[0]
        try:
            body = res['Body'].lower()
            m = re.findall(
                r'(?:<blockquote>(?:(?!\n<p>|\n<ul>|\n<a href|\n<ol>|\n<h1>|\n<h2>).)*</blockquote>)',
                body, re.DOTALL)
            for s in m:
                body = body.replace(s, '')
            lis = re.findall(
                r'(?:<pre%s*><code>%s*</code></pre>)|(?:<p>%s*</p>)|(?:<ol>%s*</ol>)|(?:<ul>%s*</ul>)'
                % (c, c, c, c, c), body, re.DOTALL)
            l = len(lis)
            i = 0
            paras = []
            while i < l:
                if not lis[i].startswith('<pre'):
                    code = []
                    trigger = False
                    for j in range(i + 1, l):
                        trigger = True
                        if lis[j].startswith('<pre'):
                            code.append(lis[j])
                        else:
                            break
                    m = '\n$$$$$\n'.join(code)
                    paras.append([lis[i], m])
                    if trigger:
                        i = j
                    else:
                        i = i + 1
                else:
                    i = i + 1
            for unit in paras:
                para = unit[0]
                para = para.replace('\n', '')
                para = para.replace('\r', '')
                para = para.replace('\r\n', '')
                para = para.strip()
                para = re.sub('(<p>|</p>)', '', para)
                para = re.sub('<[^>]*>', ' ', para)
                # para = ' '.join(para.split())
                sents = sent_tokenize(para)
                sents = [remove_puncs(sent)\
                    for sent in sents if len(sent.split()) > 5]
                if len(sents) > 0:
                    question = question + sents
                title = res['Title'].lower()
                question.append(remove_puncs(title))
        except Exception as e:
            raise e
        doc = item['tipcands']
        if level == 'para':
            para = item['tipcands']
            para = para.replace('\n', '')
            para = para.replace('\r', '')
            para = para.replace('\r\n', '')
            para = para.strip()
            para = re.sub('(<p>|</p>)', '', para)
            para = re.sub('<[^>]*>', ' ', para)
            para = ' '.join(para.split())
            sents = sent_tokenize(para)
            sents = [remove_puncs(sent)\
             for sent in sents if len(sent.split()) > 5]
            s = 0.0
            for doc0 in question:
                for doc1 in sents:
                    vec0 = sklearn_tfidf.transform([doc0]).toarray()[0]
                    vec1 = sklearn_tfidf.transform([doc1]).toarray()[0]
                    s = s + cosine_similarity(vec0, vec1)
            if s != 0.0:
                s = s / (len(question) * len(sents))
            temp.append(s)
        else:
            doc1 = remove_puncs(doc)
            s = 0.0
            for doc0 in question:
                vec0 = sklearn_tfidf.transform([doc0]).toarray()[0]
                vec1 = sklearn_tfidf.transform([doc1]).toarray()[0]
                s = s + cosine_similarity(vec0, vec1)
            if s != 0.0:
                s = s / len(question)
            temp.append(s)

        postags = [
            pair[1]
            for pair in nltk.pos_tag(word_tokenize(' '.join(id2text[txt[0]])))
        ]
        counter = collections.Counter(postags)
        if counter["NN"] > 0:
            temp.append(counter["NN"])  #number of nouns
        else:
            temp.append(0)
        if postags[0] == "NN":
            temp.append(1)  #sentence starts with noun
        else:
            temp.append(0)

        codes = re.findall(r'(?:<code>(?:(?!<code>).)*</code>)', doc,
                           re.DOTALL)
        num = sum([
            len(code.replace('<code>', '').replace('</code>', ''))
            for code in codes
        ])
        temp.append(num)  #number of characters that are code
        counter = collections.Counter(tokens)
        temp.append(counter['be'])
        matrix.append(temp)

    print(np.array(matrix).shape)

    features = {}
    for f in matrix:
        ids = str(int(f[0]))
        docs = id2text[ids]
        features[ids] = [f[1:], docs]

    cv = 10

    def build_data_cv(cv=cv):
        """
        Loads data and split into 10 folds.
        """
        process_mode = ''
        with open(
                home +
                'datasets/cv-{}-{}-dataset.pickle'.format(level, process_mode),
                'rb') as f:
            revs = pickle.load(f)
        temp = []
        for item in revs:
            ids = item['id']
            fset = features[ids][0]
            txt = features[ids][1]
            datum = {
                "y": item['y'],
                # "text": txt,
                "id": ids,
                "fset": fset,
                "split": item['split']
            }
            temp.append(datum)
        return temp

    entirefset = build_data_cv()
    with open(
            home + 'datasets/feature_set/{}-{}-sise-entirefset.pickle'.format(
                level, process_mode), 'wb') as f:
        pickle.dump(entirefset, f)
Example #18
    r'.*<code>.*\b%s\b.*</code>.*' % key,\
    r'.*<a.*href.*%s\.php.*>.*</a>.*' % key]
    for i in range(len(re_body)):
        regexp = re.compile(re_body[i])
        if regexp.search(str_body):
            is_found = True
            break
    return is_found


posts = TextLoader(home + 'raw_data/Posts.xml')
keywords = posts.read(home + 'keywords.ls')
keywords = set(keywords)

sql = "SELECT `parentid`, `acceptedanswerid` FROM kws2TID"
f = fetch(sql)
threadsid = [ids['parentid'] for ids in f]
acceptedids = [
    ids['acceptedanswerid'] for ids in f if ids['acceptedanswerid'] != -1
]
acceptedids = set(acceptedids)


def partitions(pmids, n):
    "Partitions the pmids into n subsets"
    nodes_iter = iter(pmids)
    while True:
        partition = tuple(itertools.islice(nodes_iter, n))
        if not partition:
            return
        yield partition
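
partitions is a generator, so each chunk is materialized lazily; a quick illustration with a plain range (hypothetical input, not from the original data):

# Split ten ids into chunks of three; the last chunk holds the remainder.
for chunk in partitions(range(10), 3):
    print(chunk)
# (0, 1, 2)
# (3, 4, 5)
# (6, 7, 8)
# (9,)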
Example #19
def main(level='para', process_mode='', mode='static', word_vectors='nonrand'):
    print('---{}---{}---'.format(level, process_mode))
    f = open(home + 'onehot-{}_paris_cls_id.txt'.format(level), 'r')
    outs = f.readlines()
    f.close()
    one_hot_paris_cls_id = {
        line.replace('\n', '').split(' ')[1]: line.replace('\n',
                                                           '').split(' ')[0]
        for line in outs
    }
    f = open(
        home + 'normal-{}-{}-{}-{}_paris_cls_id.txt'.format(
            level, mode, word_vectors, process_mode), 'r')
    outs = f.readlines()
    f.close()
    normal_paris_cls_id = {
        line.replace('\n', '').split(' ')[1]: line.replace('\n',
                                                           '').split(' ')[0]
        for line in outs
    }

    with open(
            home + 'datasets/feature_map/{}-{}-{}-{}-fmap.pickle'.format(
                level, mode, word_vectors, process_mode), 'rb') as handle:
        cnn_fmap = pickle.load(handle)

    with open(
            home + 'datasets/feature_set/{}-{}-sise-entirefset.pickle'.format(
                level, process_mode), 'rb') as handle:
        temp = pickle.load(handle)
    sise_fs = {}
    for entity in temp:
        sise_fs[entity['id']] = entity['fset']
    with open(home + 'datasets/w2v_model/wv_{}.pickle'.format(process_mode),
              'rb') as handle:
        vocab = pickle.load(handle)
    embeddings_index = {}
    for word in vocab:
        embeddings_index[word] = vocab[word]
    for word in embeddings_index:
        img_cols = len(embeddings_index[word])
        break

    tokenize = lambda doc: doc.lower().split(" ")
    if level == 'para':
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            item = process_raw_txt(item, mode=process_mode)
            item = remove_puncs(item)
            docs.append(item)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)
    else:
        sql = "SELECT tipcands FROM answers"
        res = fetch(sql)
        docs = []
        for item in res:
            item = item['tipcands']
            item = item.replace('\n', '')
            item = item.replace('\r', '')
            item = item.replace('\r\n', '')
            item = item.strip()
            item = re.sub('(<p>|</p>)', '', item)
            item = re.sub('<[^>]*>', ' ', item)
            sents = sent_tokenize(item)
            for sent in sents:
                sent = process_raw_txt(sent, mode=process_mode)
                sent = remove_puncs(sent)
                docs.append(sent)
        sklearn_tfidf = TfidfVectorizer(min_df=0, tokenizer=tokenize)
        sklearn_tfidf.fit(docs)
    idf = sklearn_tfidf.idf_
    vocabulary = sklearn_tfidf.vocabulary_

    templates = []
    path = home + 'datasets/ngrams/'
    for (path, dirs, files) in os.walk(path):
        for file in files:
            fp = open(path + file)
            output = fp.readlines()
            fp.close()
            templates = templates + [
                item.split('\t')[0].replace('\n', '') for item in output
            ]

    f = open(home + 'datasets/{}_tip.pos'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    pos_ids = [line.replace('\n', '').split('\t')[0] for line in texts]

    f = open(home + 'datasets/{}_tip.neg'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    neg_ids = [line.replace('\n', '').split('\t')[0] for line in texts]

    f = open(home + 'datasets/{}_tip.ds'.format(level), 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    texts = [line.replace('\n', '').split('\t') for line in texts]

    id2text = {idx: [remove_puncs(txt)] for idx, txt in texts}
    id2text_with_puncs = {idx: txt for idx, txt in texts}
    train_set = [remove_puncs(item[1]) for item in texts]
    unigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(1, 1), min_df=5)
    unigram_vectorizer.fit(train_set)
    bigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(2, 2), min_df=5)
    bigram_vectorizer.fit(train_set)
    trigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(3, 3), min_df=5)
    trigram_vectorizer.fit(train_set)

    postagged_texts = []
    for _, txt in texts:
        postagged_texts.append(' '.join([pair[1] for pair in \
                nltk.pos_tag(word_tokenize(remove_puncs(txt)))]).lower())
    pos_unigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(1, 1), min_df=5)
    pos_unigram_vectorizer.fit(postagged_texts)
    pos_bigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(2, 2), min_df=5)
    pos_bigram_vectorizer.fit(postagged_texts)
    pos_trigram_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        ngram_range=(3, 3), min_df=5)
    pos_trigram_vectorizer.fit(postagged_texts)

    matrix = []
    for idx, txt in enumerate(texts):
        if level == 'para':
            sql = "SELECT * FROM paragraphs where `ID`=%s"
        elif level == 'sent':
            sql = "SELECT * FROM sentences where `ID`=%s"
        try:
            item = fetch(sql, txt[0])[0]
        except Exception as e:
            print(txt)

            print(fetch(sql, txt[0]))
            raise e
        temp = []
        temp.append(int(txt[0]))
        #TGrams
        ngram = unigram_vectorizer.transform(id2text[txt[0]])
        temp.append(sum(ngram.tocoo().data))
        ngram = bigram_vectorizer.transform(id2text[txt[0]])
        temp.append(sum(ngram.tocoo().data))
        ngram = trigram_vectorizer.transform(id2text[txt[0]])
        temp.append(sum(ngram.tocoo().data))
        #POSGrams
        tags = [
            ' '.join([
                pair[1] for pair in nltk.pos_tag(
                    word_tokenize(' '.join(id2text[txt[0]])))
            ]).lower()
        ]
        ngram = pos_unigram_vectorizer.transform(tags)
        temp.append(sum(ngram.tocoo().data))
        ngram = pos_bigram_vectorizer.transform(tags)
        temp.append(sum(ngram.tocoo().data))
        ngram = pos_trigram_vectorizer.transform(tags)
        temp.append(sum(ngram.tocoo().data))
        #Surface
        tokens = id2text[txt[0]][0].split()
        temp.append(len(tokens))
        #template
        ids = 0
        num_t = 0
        wc_pos = 0
        maximum = 0
        for jdx, t in enumerate(templates):
            no = t.replace(' ', '\\s')
            no = no.replace('*', '[\w|\']+')
            regexp = re.compile(no)
            if regexp.search(item['tipcands']):
                ids = ids + (jdx + 1)
                num_t = num_t + 1
                wc_pos = wc_pos + (t.split(' ').index('*') + 1)
                if len(t.split(' ')) > maximum:
                    maximum = len(t.split(' '))
        temp.append(ids)
        temp.append(num_t)
        temp.append(wc_pos)
        temp.append(maximum)
        temp = temp + list(sise_fs[txt[0]])
        temp.append(int(one_hot_paris_cls_id[txt[0]]))
        temp.append(int(normal_paris_cls_id[txt[0]]))
        #W2V
        cnt = 0
        cnt1 = 0
        uniw2v = np.zeros(img_cols)
        idfw2v = np.zeros(img_cols)
        for token in tokens:
            if token in embeddings_index:
                uniw2v = np.add(uniw2v, embeddings_index[token])
                if token in vocabulary:
                    idfw2v = np.add(
                        idfw2v,
                        idf[vocabulary[token]] * embeddings_index[token])
                    cnt1 = cnt1 + idf[vocabulary[token]]
                cnt = cnt + 1
        if cnt != 0:
            uniw2v = 1 / cnt * uniw2v
            idfw2v = 1 / cnt1 * idfw2v
        temp = temp + list(uniw2v) + list(idfw2v) + list(cnn_fmap[txt[0]])

        matrix.append(temp)

    print(np.array(matrix).shape)

    features = {}
    for f in matrix:
        ids = str(int(f[0]))
        docs = id2text[ids]
        features[ids] = [f[1:], docs]

    cv = 10

    def build_data_cv(cv=cv):
        """
        Loads data and split into 10 folds.
        """
        with open(
                home +
                'datasets/cv-{}-{}-dataset.pickle'.format(level, process_mode),
                'rb') as f:
            revs = pickle.load(f)
        temp = []
        for item in revs:
            ids = item['id']
            fset = features[ids][0]
            txt = features[ids][1]
            datum = {
                "y": item['y'],
                # "text": txt,
                "id": ids,
                "fset": fset,
                "split": item['split']
            }
            temp.append(datum)
        return temp

    entirefset = build_data_cv()
    with open(
            home + 'datasets/feature_set/{}-{}-trip-entirefset.pickle'.format(
                level, process_mode), 'wb') as f:
        pickle.dump(entirefset, f)
Example #20
def main(argv):
    # get all files
    files = sql.fetch_all(None, None, "SELECT * FROM `raws`")

    for f in files:
        print("Processing %s" % f['filename'])
        raw_id = f['id']

        # preprocess
        lines = []
        try:
            line_nr = 0
            for line in f['data'].split("\r\n"):
                if len(line) == 0: continue
                line_nr += 1
                line = line.rstrip('\r\n')
                line = re.split(r'\t|;', line)
                line = preprocess_line(line)
                lines.append(line)
        except:
            print "%d: %s" % (line_nr, line)
            raise

        # fix!
        line_nr = 0
        try:
            for line in lines:
                line_nr += 1
                phenotype = format_line(line)  # create a readable program

                # look up the phenotype id based on the line number
                try:
                    if (phenotype['entity_id'] == 808
                            and phenotype['value_id'] == 178):
                        continue
                    phenotype_id = sql.fetch_all('phenotype_raws', {
                        'line_nr': line_nr,
                        'raw_id': raw_id
                    })[0]['phenotype_id']
                except:
                    print "%d: %d" % (line_nr, raw_id)
                    raise

                # get the linked phenotype_plants / phenotype_samples / phenotype_aliquots id, if any
                ph_plant = sql.fetch('phenotype_plants', phenotype_id,
                                     'phenotype_id')
                ph_sample = sql.fetch('phenotype_samples', phenotype_id,
                                      'phenotype_id')
                ph_aliquot = sql.fetch('phenotype_aliquots', phenotype_id,
                                       'phenotype_id')

                # check where the link should belong and remove the others, if any
                if ora_sql.is_plant(
                        phenotype['sample_id']) or ora_sql.was_plant(
                            phenotype['sample_id']):
                    if ph_sample != False:
                        print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample[
                            'id']
                    if ph_aliquot != False:
                        print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot[
                            'id']
                elif ora_sql.is_sample(phenotype['sample_id']):
                    if ph_aliquot != False:
                        print "DELETE FROM `phenotype_aliquots` WHERE id = %s;" % ph_aliquot[
                            'id']
                    if ph_plant != False:
                        print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant[
                            'id']
                elif ora_sql.is_aliquot(phenotype['sample_id']):
                    if ph_sample != False:
                        print "DELETE FROM `phenotype_samples` WHERE id = %s;" % ph_sample[
                            'id']
                    if ph_plant != False:
                        print "DELETE FROM `phenotype_plants` WHERE id = %s;" % ph_plant[
                            'id']
                else:
                    print "%s NOT found!!" % phenotype['sample_id']

        except:
            progress("%d: %s" % (line_nr, line))
            raise