def getDateInfo(filename):
    """Interactively learn timestamp patterns from the lines of *filename*.

    Lines rejected by possiblyHasTimeStamp() are skipped.  For every other
    line, guessTimestamp() is tried with the patterns learned so far; if it
    returns a value the line counts as parsed.  Otherwise the line is shown
    and the user is prompted for the timestamp values it contains, which
    learnrobRegex() turns into an (html, regex) pair.  The prompt repeats
    until a pattern is learned or the user enters an empty answer.

    The session ends when the lines run out or the user presses Control-C.
    Any other exception is reported with a traceback and whatever was
    learned up to that point is still returned.

    Returns a tuple (regexes, totalhtml): the list of distinct regexes
    learned, and the concatenated html returned alongside each new regex.
    """
    regexes = []
    totalhtml = ""
    count = 1  # passed to genformatname(); advanced once per newly stored pattern
    lines = dcutils.loadLines(filename)
    try:
        printQuestion("Interactively Learning Date Formats.")
        printedInstructions = False
        for linenum, line in enumerate(lines, 1):
            line = line.strip()
            prettyLine = formatLine(line, 100, "\t")

            if not possiblyHasTimeStamp(line):
                print("Skipping unpromissing line " + str(linenum) + ".")
                continue

            # Nothing to learn when the patterns gathered so far already work.
            if guessTimestamp(regexes, line) is not None:
                print("Parsed Date on line " + str(linenum) + ".")
                continue

            # Show the instructions only once, before the first sample line.
            if not printedInstructions:
                printedInstructions = True
                print(Instruction)
            print("\nSAMPLE LINE " + str(linenum) + ":\n" + prettyLine + "\n" + "-" * 80)

            while True:
                timeformat = prompt("timestamp values as: month, day, year, hour, minute, second, ampm, timezone.\n\t")
                if timeformat == "":
                    # An empty answer means there is no timestamp on this line.
                    break
                timevalues = [v.lower().strip() for v in timeformat.split(",")]
                formatname = genformatname(filename, count)
                html, regex = learnrobRegex(formatname, line, timevalues)
                if html is None:
                    print("Unable to learn pattern. Enter the timestamp values again. If there is no timestamp on this line, just hit Enter.")
                    continue
                print("Learned pattern.")
                # Store each distinct pattern (and its html) only once.
                if regex not in regexes:
                    totalhtml += html
                    count += 1
                    regexes.append(regex)
                break

            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # source; this reminder is assumed to follow every prompted line.
            printQuestion("If you are satisfied that the timestamps formats have been learned, hit Control-C.")
    except KeyboardInterrupt:
        # Control-C is the advertised way to finish; keep what was learned.
        pass
    except Exception:
        # Best effort: report the failure but still return partial results.
        import traceback
        traceback.print_exc()
    return regexes, totalhtml
def getDateInfo(filename):
    """Interactively learn timestamp patterns from the lines of *filename*.

    Lines rejected by possiblyHasTimeStamp() are skipped.  For every other
    line, guessTimestamp() is tried with the patterns learned so far; if it
    returns a value the line counts as parsed.  Otherwise the line is shown
    and the user is prompted for the timestamp values it contains, which
    learnrobRegex() turns into an (html, regex) pair.  The prompt repeats
    until a pattern is learned or the user enters an empty answer.

    The session ends when the lines run out or the user presses Control-C.
    Any other exception is reported with a traceback and whatever was
    learned up to that point is still returned.

    Returns a tuple (regexes, totalhtml): the list of distinct regexes
    learned, and the concatenated html returned alongside each new regex.
    """
    regexes = []
    totalhtml = ""
    count = 1  # passed to genformatname(); advanced once per newly stored pattern
    lines = dcutils.loadLines(filename)
    try:
        printQuestion("Interactively Learning Date Formats.")
        printedInstructions = False
        for linenum, line in enumerate(lines, 1):
            line = line.strip()
            prettyLine = formatLine(line, 100, "\t")

            if not possiblyHasTimeStamp(line):
                print("Skipping unpromissing line " + str(linenum) + ".")
                continue

            # Nothing to learn when the patterns gathered so far already work.
            if guessTimestamp(regexes, line) is not None:
                print("Parsed Date on line " + str(linenum) + ".")
                continue

            # Show the instructions only once, before the first sample line.
            if not printedInstructions:
                printedInstructions = True
                print(Instruction)
            print("\nSAMPLE LINE " + str(linenum) + ":\n" + prettyLine + "\n" + "-" * 80)

            while True:
                timeformat = prompt("timestamp values as: month, day, year, hour, minute, second, ampm, timezone.\n\t")
                if timeformat == "":
                    # An empty answer means there is no timestamp on this line.
                    break
                timevalues = [v.lower().strip() for v in timeformat.split(",")]
                formatname = genformatname(filename, count)
                html, regex = learnrobRegex(formatname, line, timevalues)
                if html is None:
                    print("Unable to learn pattern. Enter the timestamp values again. If there is no timestamp on this line, just hit Enter.")
                    continue
                print("Learned pattern.")
                # Store each distinct pattern (and its html) only once.
                if regex not in regexes:
                    totalhtml += html
                    count += 1
                    regexes.append(regex)
                break

            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # source; this reminder is assumed to follow every prompted line.
            printQuestion("If you are satisfied that the timestamps formats have been learned, hit Control-C.")
    except KeyboardInterrupt:
        # Control-C is the advertised way to finish; keep what was learned.
        pass
    except Exception:
        # Best effort: report the failure but still return partial results.
        import traceback
        traceback.print_exc()
    return regexes, totalhtml
def printLineSamplings(filenames, maxPerFile = 10, searchKey = None):
    """Print a random sample of lines from each file in *filenames*.

    For every file a header naming the file is printed, then the sampled
    lines (each prefixed with a tab), then an empty line.  A stripped line is
    printed when a 1-in-10 random draw succeeds, the line is longer than 20
    characters and, if searchKey is given, the line contains searchKey.

    filenames  -- iterable of file names, each read with dcutils.loadLines()
    maxPerFile -- upper bound on lines printed per file; a value <= 0 prints
                  no sample lines (the header and empty line still appear)
    searchKey  -- optional substring that every printed line must contain
    """
    import random
    for fn in filenames:
        lines = dcutils.loadLines(fn)
        print("\nSample lines from %s \n------------------------------------------------------------------------" % (fn,))
        printed = 0
        for line in lines:
            # Checked before sampling so a limit of zero really prints nothing.
            if printed >= maxPerFile:
                break
            line = line.strip()
            if random.randint(1, 10) == 1 and len(line) > 20 and (searchKey is None or searchKey in line):
                print("\t" + line)
                printed += 1
        print("")
def learnFieldRulesFromFile(filename, fieldname, goodTerms, badTerms, maxLines, first): filetype = getFileType(filename) _printTiming("Got filetype: " + filetype) lines = dcutils.loadLines(filename) if len(lines) > maxLines: lines = lines[:maxLines] if first: print "Large training file. Limiting learning to first", maxLines, "lines of", filename print "Learning..." _printTiming("Loaded lines") # strictRules = _generateRules(filetype, fieldname, lines, goodTerms, None) looseRules = _generateRules(filetype, fieldname, lines, goodTerms, "Loose") #rules = strictRules + looseRules rules = looseRules _printTiming("Generated lines") if _debug > 0: print "Rules Generated:", len(rules) newTerms = _validateRules(lines, goodTerms, badTerms, rules) _printTiming("Validated lines") if _debug > 0: print "Rules Approved:", len(rules) #if rulesdict != None and len(rulesfile) > 0: # saveRules(rulesfile, rules) return rules, newTerms
# Command-line driver.  Six arguments select training mode, three select
# run mode, anything else prints the usage text.
# NOTE(review): argc is assigned outside this section -- presumably
# len(sys.argv); confirm.
argv = sys.argv
rulesfile = "rules.xml"
if argc == 6:
    # Training mode: learn rules for one field from comma-separated terms.
    filename = argv[1]
    fieldname = argv[2]
    # rulesfile is read here but not used by the call below.
    rulesfile = argv[3]
    goodstr = argv[4]
    badstr = argv[5]
    # Split the comma-separated term lists and trim whitespace around each term.
    goodterms = set([v.strip() for v in goodstr.split(",")])
    badterms = set([v.strip() for v in badstr.split(",")])
    #rules, newterms = learnFieldRulesFromFile(rulesfile, filename, fieldname, goodterms, badterms)
    rules, newterms = interactivelyLearn(filename, fieldname, goodterms, badterms, 5, 10000)
    print len(rules), "rules"
    print "Terms: ", newterms
elif argc == 3:
    # Run mode: print every line for which getExtractions() returns a value.
    filename = argv[1]
    rulesfile = argv[2]
    filetype = getFileType(filename)
    # The rules dictionary starts empty; nothing in this section fills it
    # from rulesfile.
    rulesdict = {}
    lines = dcutils.loadLines(filename)
    for line in lines:
        extractions = getExtractions(rulesdict, filetype, line)
        if extractions != None:
            print line
            print "\t", extractions
else:
    # Wrong number of arguments: show how to invoke both modes.
    print 'Usage \n'
    print '\tTo Train: \t' + argv[0] + ' <file> <fieldname> <rulesfile. empty "" to not save> "<good terms comma separated>" "<bad terms comma separated>"'
    print '\tTo Run: \t' + argv[0] + ' <file> <rulesfile>'
# Command-line dispatch.  Six arguments select training mode, three select
# run mode, anything else prints the usage text.
# NOTE(review): argc and argv are assigned outside this section -- presumably
# from sys.argv; confirm.
if argc == 6:
    # Training mode: learn rules for one field from comma-separated terms.
    filename = argv[1]
    fieldname = argv[2]
    # rulesfile is read here but not used by the call below.
    rulesfile = argv[3]
    goodstr = argv[4]
    badstr = argv[5]
    # Split the comma-separated term lists and trim whitespace around each term.
    goodterms = set([v.strip() for v in goodstr.split(",")])
    badterms = set([v.strip() for v in badstr.split(",")])
    #rules, newterms = learnFieldRulesFromFile(rulesfile, filename, fieldname, goodterms, badterms)
    rules, newterms = interactivelyLearn(filename, fieldname, goodterms, badterms, 5, 10000)
    print len(rules), "rules"
    print "Terms: ", newterms
elif argc == 3:
    # Run mode: print every line for which getExtractions() returns a value.
    filename = argv[1]
    rulesfile = argv[2]
    filetype = getFileType(filename)
    # The rules dictionary starts empty; nothing in this section fills it
    # from rulesfile.
    rulesdict = {}
    lines = dcutils.loadLines(filename)
    for line in lines:
        extractions = getExtractions(rulesdict, filetype, line)
        if extractions != None:
            print line
            print "\t", extractions
else:
    # Wrong number of arguments: show how to invoke both modes.
    print 'Usage \n'
    print '\tTo Train: \t' + argv[0] + ' <file> <fieldname> <rulesfile. empty "" to not save> "<good terms comma separated>" "<bad terms comma separated>"'
    print '\tTo Run: \t' + argv[0] + ' <file> <rulesfile>'
# next best are equally weird. using pattern at start of line before break or end of line after break elif len(beforeEnd) > 0: print "LINE_BREAKER = %s" % makeRegex(list(beforeEnd)[0], False) elif len(afterStart) > 0: print "LINE_BREAKER = %s" % makeRegex(list(afterStart)[0], True) if __name__ == '__main__': import sys if len(sys.argv) != 2: print 'Usage: python %s "file of events"' % sys.argv[0] print ' file should break events with "-=X=-" on a separate line' else: events = [] filename = sys.argv[1] lines = dcu.loadLines(filename) if lines == []: print "cannot get events" exit(1) event = '' for line in lines: line = line.strip() if line == '-=X=-': if event != '': events.append(event) event = '' else: if event != '': event += '\n' event += line