def run(self):
    logger = myLogger.myLogger("parsing_thread_log")
    linkedUrls = myLogger.myLogger("linked_urls_log", format='data')
    parser = myParser(myurl=self.site.url, domain=self.site.domain)
    try:
        ## run the parser on the html from this page. ##
        parser.parse(self.html)
    except Exception as e:
        logger.error('urlReader line 188: ' + str(e))
        # Mark the page as failed (status 400) and crawled so it is not retried;
        # doc_id is assumed to be set elsewhere in this class.
        self.myDBConn.cursor.execute(
            "UPDATE sites_sitequeue SET status=400, crawled=1 WHERE id = %d;" % doc_id)
        return
def __init__(self, token, dictionary, output, mode):
    logger = myLogger.myLogger("Input layer initializer")
    logger.info("Initializing input raw")
    # Map every token to its dictionary id: calls -> sentences -> token ids.
    self.input = map(
        lambda call: map(
            lambda sentence: map(lambda x: dictionary.token2id[x], sentence),
            call),
        token)
    self.output, self.act_dict, self.slot_dict = label_dict(output)
    # Find the longest sentence and the largest number of sentences per session.
    self.sentence_length = 0
    self.sentence_count = 0
    for session in self.input:
        self.sentence_count = max(self.sentence_count, len(session))
        for sentence in session:
            self.sentence_length = max(self.sentence_length, len(sentence))
    # Initialize the ndarrays (zero-padded to the maximum dimensions).
    self.input_mtr = np.zeros(
        (len(self.input), self.sentence_count, self.sentence_length))
    self.output_mtr = np.zeros(
        (len(self.input), self.sentence_count, len(self.act_dict)))
    for session_index in range(0, len(self.input)):
        for sentence_index in range(0, len(self.input[session_index])):
            # Copy the token ids; only the label ids (acts for mode 1,
            # slots for mode 2) are recorded in output_mtr.
            for n in range(0, len(self.input[session_index][sentence_index])):
                self.input_mtr[session_index][sentence_index][n] = \
                    self.input[session_index][sentence_index][n]
            if mode == 1:
                # mode 1: multi-hot dialog-act labels.
                for n in self.output[session_index]["act"][sentence_index]:
                    self.output_mtr[session_index][sentence_index][n] = 1
            elif mode == 2:
                # mode 2: multi-hot slot labels.
                for n in self.output[session_index]["slot"][sentence_index]:
                    self.output_mtr[session_index][sentence_index][n] = 1
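# Resulting tensor shapes after __init__ runs:
#   input_mtr.shape  == (num_calls, max_sentences_per_call, max_sentence_length)
#   output_mtr.shape == (num_calls, max_sentences_per_call, len(act_dict))
# Shorter sessions and sentences remain zero-padded.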
def dictionary_initializer(token):
    """
    Build a gensim dictionary from the tokenized calls.
    :param token: list of calls, each call a list of tokenized sentences
    :return: corpora.Dictionary mapping each token to an integer id
    """
    logger = myLogger.myLogger("Dictionary initializer")
    logger.info("Started building dictionary")
    # Flatten each call into a single document before building the dictionary.
    raw = map(lambda element: reduce(lambda x, y: x + y, element), token)
    dictionary = corpora.Dictionary(raw)
    logger.info("Finished building dictionary")
    return dictionary
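# Illustrative usage only (the toy data below is not part of the project): the
# expected input is a list of calls, each call a list of tokenized sentences.
#
#   toy_token = [[["hello", "there"], ["book", "a", "table"]]]
#   d = dictionary_initializer(toy_token)
#   # d.token2id then maps each distinct word to an integer id.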
def token_initializer(data):
    """
    Tokenize the raw input text.
    :param data: list of calls, each call a list of raw sentences
    :return: the same structure with every sentence replaced by its lower-cased tokens
    """
    logger = myLogger.myLogger("Token initializer")
    logger.info("Started tokenizing")
    token = map(
        lambda element: map(lambda x: nltk.word_tokenize(x.lower()), element),
        data)
    logger.info("Tokenizing finished")
    return token
def raw_initializer(dataset):
    """
    Read the input and output information from the JSON dataset.
    :param dataset: iterable of calls; each call yields (turn, label) pairs
    :return: dict with "input" (raw transcripts) and "output" (dialog-act / semantic labels)
    """
    logger = myLogger.myLogger("initializer")
    logger.info("Started raw initializing")
    input_raw = []
    output_raw = []
    for call in dataset:
        input_row = []
        output_row = []
        # if call.log["session-id"] == "voip-f32f2cfdae-130328_192703":
        for turn, label in call:
            # Turn side: transcript and dialog acts.
            input_row.append(turn["output"]["transcript"].lower())
            output_row.append(turn["output"]["dialog-acts"])
            # Label side: transcription and semantic labels.
            input_row.append(label["transcription"].lower())
            output_row.append(label["semantics"]["json"])
        input_raw.append(input_row)
        output_raw.append(output_row)
    logger.info("Finished raw initializing")
    print(len(input_raw))
    return {"input": input_raw, "output": output_raw}
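# End-to-end sketch of how these helpers fit together (illustrative only; it assumes
# the class whose __init__ appears above is importable as `Input`, and that `dataset`
# is the DSTC-style collection of calls that raw_initializer() expects):
#
#   raw = raw_initializer(dataset)
#   tokens = token_initializer(raw["input"])
#   dictionary = dictionary_initializer(tokens)
#   layer = Input(tokens, dictionary, raw["output"], mode=1)  # mode=1 -> act labels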
def input_tokenize(sentence):
    logger = myLogger.myLogger("tokenize test")
    tokens = nltk.word_tokenize(sentence)
    # corpora.Dictionary expects a list of documents, so wrap the single token list.
    dictionary = corpora.Dictionary([tokens])
    print(dictionary.token2id)
    logger.info("dictionary test finished")
import myLogger
import Configuration
import datetime
from databaseConn import databaseConn

#### GLOBAL DATA ######
myDBConn = databaseConn()

#### MAIN METHOD ####
if __name__ == '__main__':
    _oLog = myLogger.myLogger(moduleName="url_assign_log")
    _oLog.info("Started parsing out new URLs")

    now = datetime.datetime.now()
    date = now.strftime('%Y%m%d')
    if Configuration.getSetting('override_date', '') != '':
        date = Configuration.getSetting('override_date', date)
    try:
        # Validate the (possibly overridden) date string.
        datetime.datetime.strptime(date, "%Y%m%d")
    except ValueError:
        _oLog.error("Parsing date is not a valid date. Must be in format: yyyyMMdd")

    _oLog.info("Running URL Assign on date: " + str(date))
    file = Configuration.getSetting(
        "url_handoff",
        "C:\\UUMMUU_Code\\Logs\\" + str(date)[0:6] + "\\LINKED_URLS_LOG\\linked_urls_log_" + str(date) + '.log')
    try:
        opened_file = open(file)
    except IOError:
        _oLog.info("File is not ready to be parsed. Exiting...")
        exit()
'''
NOTE: custom status code 501 in the UUMMUU index means an unsupported file type.
'''
myDBConn = databaseConn()

# Start a logger to keep track of what breaks.
logger = myLogger.myLogger(moduleName="url_reader_log")

result = myDBConn.getDomainCount(sys.argv)
try:
    crawl_domain = result[0][0]
    counter = int(result[0][1])
except:
    logger.error('there was an error getting any site domains to crawl')

# Get a list of the links we are to ignore for this domain.
dont_follow_links = find_RobotsTxt(crawl_domain)

# Make sure we do not crawl too many pages for a single domain.
while pageCounter < limit:
    # An index of all the words.
    index = {}
    total_unique_words = []
    repository = {}
    ## a list of the links on this page ##
    link_list = []
    ### Set up the variables that will be put in the database. USE THESE IN THE SQL STATEMENT.
    site = Site()
    try:
        if crawl_domain != '':
            sql = ("SELECT url, id, domain FROM sites_sitequeue "
                   "WHERE crawled != 1 AND status = 200 AND domain = '%s' "
                   "ORDER BY id LIMIT 1;" % (crawl_domain))
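            # A safer variant of the query above (sketch only; it assumes myDBConn.cursor
            # is a standard DB-API cursor, which is not shown in this file): bind the
            # domain as a query parameter instead of interpolating it into the string.
            #   sql = ("SELECT url, id, domain FROM sites_sitequeue "
            #          "WHERE crawled != 1 AND status = 200 AND domain = %s "
            #          "ORDER BY id LIMIT 1;")
            #   myDBConn.cursor.execute(sql, (crawl_domain,))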