def parsefile(f, inPre, titleSet, per, loc, org, other):
    fin = codecs.open(inPre + f, encoding='utf-8')
    for line in fin:
        if len(line.strip().split("\t")) != 11:
            continue
        ID, url, title, source, created_at, authors, key_word, snippets, raw_text, \
            h_tokens_ent, b_tokens_ent = line.strip().split("\t")  # h_tokens,b_tokens,
        # skip articles whose title was already seen
        if title in titleSet:
            continue
        titleSet.add(title)
        # cap body length
        if len(b_tokens_ent.split()) > MAX_BODY_LEN:
            continue
        h_tokens_ent = unidecode.unidecode(h_tokens_ent.strip())
        b_tokens_ent = unidecode.unidecode(b_tokens_ent.strip())
        #h = grep_ent_with_context(h_tokens_ent, per, loc, org, other)  # fds_per_| asked me about ...
        #b = grep_ent_with_context(b_tokens_ent, per, loc, org, other)
        h = grep_ent(h_tokens_ent, per, loc, org, other)  # fsd_per_| oregon_loc_| ...
        b = grep_ent(b_tokens_ent, per, loc, org, other)
        h = rep2.sub('', h)
        b = rep2.sub('', b)
        h = my_tokenizer(h, tokenizer)
        b = my_tokenizer(b, tokenizer)
        tokens = h + ' ' + h + ' ' + b  # title twice
        yield tokens.lower(), bk.News(
            ID, title, raw_text, snippets, key_word, source, created_at,
            f.split('.')[0], h_tokens_ent, b_tokens_ent)  # can also leave lowercase to scikit
    fin.close()
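# Usage sketch (hypothetical helper, never called in this module): consumes
# the parsefile() generator above and vectorizes the yielded token strings.
# TfidfVectorizer is an assumption prompted by the "leave lowercase to scikit"
# comment; any scikit-learn vectorizer would fit the same pattern.
def _demo_parsefile_usage(fnames, inPre, per, loc, org, other):
    from sklearn.feature_extraction.text import TfidfVectorizer
    titleSet = set()  # shared across files so duplicate titles are skipped globally
    docs, objs = [], []
    for f in fnames:
        for tokens, news in parsefile(f, inPre, titleSet, per, loc, org, other):
            docs.append(tokens)  # lowercased "title title body" token string
            objs.append(news)    # parallel bk.News metadata object
    X = TfidfVectorizer().fit_transform(docs)  # sparse docs-by-terms matrix
    return X, objs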
def readfile(file, dataop, count, ind2obj, dtpure, lines):
    # lines = []
    # ind2obj = {}
    # count = 0
    for line in file:
        if len(line.strip().split("\t")) == 10:  # after dbpedia entities are stored
            ID, url, title, source, created_at, authors, key_word, snippets, \
                raw_text, entities = line.strip().split("\t")
        elif len(line.strip().split("\t")) == 9:
            ID, url, title, source, created_at, authors, key_word, snippets, \
                raw_text = line.strip().split("\t")
        else:
            continue
        strAll = (title + raw_text).lower()
        interested = False
        if 'powerball' in strAll:  # and 'benghazi' in strAll:
            interested = True
        if not interested:
            continue
        ID = int(ID)
        ind2obj[count] = bk.News(ID, title, raw_text, snippets, key_word,
                                 source, created_at, dtpure)
        if dataop == "all":
            lines.append(line)
        if dataop == "text":
            lines.append('\t'.join([url, title, key_word, snippets, raw_text]))
        if dataop == "snippets":
            lines.append('\t'.join([url, title, key_word, snippets]))
        count += 1
    return count
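# Usage sketch (hypothetical helper): drives the readfile() variant above over
# one TSV dump. `path` and `dtpure` are placeholder arguments; the caller owns
# count/ind2obj/lines exactly as the commented-out initializers suggest.
def _demo_readfile_usage(path, dtpure):
    ind2obj, lines = {}, []
    with codecs.open(path, encoding='utf-8') as fin:
        count = readfile(fin, "snippets", 0, ind2obj, dtpure, lines)
    return count, ind2obj, lines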
def parsefile(f, per, loc, org, other):
    fin = codecs.open(f, encoding='utf-8')
    for line in fin:
        #if len(b_tokens.split()) > MAX_BODY_LEN:
        #    continue
        b_tokens_ent = unidecode.unidecode(line.strip())
        b = grep_ent(b_tokens_ent, per, loc, org, other)
        b = rep2.sub('', b)
        b = my_tokenizer(b, tokenizer)
        yield b.lower(), bk.News(raw_text=line.strip())  # can also leave lowercase to scikit
    fin.close()
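# Usage sketch (hypothetical helper): this parsefile() variant reads one raw
# document per line rather than TSV columns, so it can be pointed at a plain
# text dump; zip(*...) splits the (tokens, News) pairs into parallel lists.
def _demo_plain_parsefile(path, per, loc, org, other):
    pairs = list(parsefile(path, per, loc, org, other))
    if not pairs:
        return [], []
    docs, objs = zip(*pairs)
    return list(docs), list(objs)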
def readfile(file, dataop, count, ind2obj, lines):
    # lines = []
    # ind2obj = {}
    # count = 0
    for line in file:
        if len(line.strip().split("\t")) != 9:
            continue
        ID, url, title, source, created_at, authors, key_word, snippets, \
            raw_text = line.strip().split("\t")
        ind2obj[count] = bk.News(ID, title, raw_text, snippets, key_word,
                                 source, created_at)
        if dataop == "all":
            lines.append(line)
        if dataop == "text":
            lines.append('\t'.join([url, title, key_word, snippets, raw_text]))
        if dataop == "snippets":
            lines.append('\t'.join([url, title, key_word, snippets]))
        count += 1
    return count
def readfile(file, dataop, count, ind2obj, dtpure, lines):
    # lines = []
    # ind2obj = {}
    # count = 0
    for line in file:
        if len(line.strip().split("\t")) != 10:  # after dbpedia entities are stored
            continue
        ID, url, title, source, created_at, authors, key_word, snippets, \
            raw_text, entities = line.strip().split("\t")
        ID = int(ID)
        ind2obj[count] = bk.News(ID, title, raw_text, snippets, key_word,
                                 source, created_at, dtpure)
        if dataop == "all":
            lines.append(line)
        if dataop == "text":
            lines.append('\t'.join([url, title, key_word, snippets, raw_text]))
        if dataop == "snippets":
            lines.append('\t'.join([url, title, key_word, snippets]))
        count += 1
    return count
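# Usage sketch (hypothetical helper): the dataop argument picks which columns
# land in `lines`; here the "text" projection of the entity-tagged dump is
# written back out. File paths are placeholders.
def _demo_write_projection(in_path, out_path, dtpure):
    ind2obj, lines = {}, []
    with codecs.open(in_path, encoding='utf-8') as fin:
        readfile(fin, "text", 0, ind2obj, dtpure, lines)
    with codecs.open(out_path, 'w', encoding='utf-8') as fout:
        fout.write('\n'.join(lines) + '\n')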