def tag(text):
    text = tp.preprocess(text)
    #print text
    t1 = time.time()
    outlist = post.tag(text.split())
    t2 = time.time()
    print "POS Tagging complete. Time taken: ", t2 - t1, " seconds"
    return outlist
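# Hedged sketch, not part of the original module: `post` above is presumed to
# be a pre-initialized Stanford POS tagger, e.g. NLTK's StanfordPOSTagger
# wrapper; the model and jar paths here are placeholder assumptions, not the
# repo's actual configuration.
from nltk.tag import StanfordPOSTagger

post = StanfordPOSTagger('models/english-bidirectional-distsim.tagger',  # hypothetical path
                         'stanford-postagger.jar')                       # hypothetical path
# Example (returns (token, POS-tag) pairs, which tag() above times and returns):
#   tag("Built REST APIs in Python and Django")
#   # -> [('Built', 'VBN'), ('REST', 'NNP'), ('APIs', 'NNS'), ...]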
def performTask(rawtext):
    text = textprocess.preprocess(rawtext)
    POS_text = pt.tag(text)
    print POS_text
    # Lowercasing is deferred until after tagging because the Stanford tagger
    # may make use of capitalization.
    text = text.lower()
    skilldict = buildskilldict(skills)
    naive_skills = generate_naive(text, skilldict)
    expanded_skills = new_guesses(POS_text, naive_skills, words, clusters)
    return naive_skills, expanded_skills
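# Illustrative sketch only: buildskilldict() is defined elsewhere in the repo.
# The assumption here is that `skills` is an iterable of known skill phrases
# and that the result only needs to support the membership tests used in
# naive() below (e.g. `token in skilldict`), so a lowercased set would do.
def buildskilldict(skills):
    # Normalize to lowercase so lookups match the lowercased text.
    return set(skill.strip().lower() for skill in skills)
# Example:
#   buildskilldict(["Python", "Machine Learning"])  # -> {'python', 'machine learning'}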
def process_facc1_with_fileobj(facc1_obj, clueweb_obj, logout=sys.stdout, logerr=sys.stderr):
    nlpobj = tp.init_CoreNLPServer()
    record = clueweb_obj.read_record()
    entity_set = set()
    is_a_new_record = True
    # Initialized up front so the error paths below never hit unbound names.
    sentences = []
    html_data = ''
    for line in facc1_obj:
        (trec_id, encoding, entity_name, entity_start,
         entity_end, _, __, freebase_id) = line.strip().split('\t')
        # We can iterate over the facc1 file alongside the clueweb09 file,
        # because both files are organized linearly and ordered by WARC-TREC-ID.
        while record is not None and ('warc-trec-id' not in record
                                      or record['warc-trec-id'] != trec_id):
            record = clueweb_obj.read_record()
            is_a_new_record = True
            sentences = []
        if record is None:
            break
        try:
            if is_a_new_record:
                is_a_new_record = False
                html_data = tp.preprocess(record.payload)
                # Each time a new html file is parsed, the entity set must be reset.
                entity_set.clear()
                #logerr.flush()
                #logout.flush()
                sentences = tp.get_sentences_from_html_v2(html_data, nlpobj)
                #tp.output_html(trec_id, html_data)
                #tp.output_sentences(trec_id, sentences)
        except Exception:
            logerr.write("\t".join((line.strip(), "failed_to_get_sentences",
                                    re.sub(r'\r\n', '', html_data))) + "\n")
            continue
        if freebase_id in entity_set:
            continue
        else:
            entity_set.add(freebase_id)
        # Take the longest sentence among those in which the entity appears.
        try:
            sentence = max((s for s in sentences if entity_name in s), key=len)
        except ValueError:
            logerr.write(line.strip() + "\tentity_not_found\n")
            continue
        logout.write("\t".join(x for x in (
            trec_id,
            entity_name,
            freebase_id,
            re.sub(r'\t', u' ', sentence).encode('utf-8')
        )) + "\n")
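# Hedged reference for the FACC1 input format: each line is tab-separated and
# the unpacking in process_facc1_with_fileobj expects eight columns (TREC doc
# id, encoding, entity surface form, mention start/end offsets, two confidence
# scores, Freebase id). The sample values below are invented for illustration.
sample_facc1_line = "\t".join([
    "clueweb09-en0000-00-00000",  # trec_id (invented)
    "UTF-8",                      # encoding
    "Barack Obama",               # entity_name (surface form)
    "1754",                       # entity_start offset
    "1766",                       # entity_end offset
    "0.9972",                     # linker confidence (unpacked as _)
    "0.8463",                     # context-only confidence (unpacked as __)
    "/m/02mjmr",                  # freebase_id
])
# sample_facc1_line.strip().split('\t') yields the eight fields unpacked above.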
def naive(text, skillfilter=None, jointfilter=True):
    # preprocess
    text = textprocess.preprocess(text)
    # generate word scores
    wordscores = calculateWordScores(text)
    # tokenize
    tokens = text.split()
    # prefilter
    if skillfilter == 'pre':
        tokens = [token for token in tokens if token in skilldict]
    phraseList = [ngram for n in range(3) for ngram in generate_ngrams(tokens, n + 1)]
    keywordcandidates = generateCandidateKeywordScores(phraseList, wordscores)
    scored_ngrams = sorted(keywordcandidates.iteritems(),
                           key=operator.itemgetter(1), reverse=True)
    # post-filter
    if skillfilter == 'post':
        scored_ngrams = [(ngram, score) for (ngram, score) in scored_ngrams
                        if ngram in skilldict]
    # format
    return [{'keyword': pair[0], 'weight': pair[1]} for pair in scored_ngrams]
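# generate_ngrams() is not defined in this section; the calls above only need
# it to return the contiguous n-grams of the token list as strings, so they
# can be scored and looked up in skilldict. A minimal sketch under that
# assumption (not necessarily the repo's own implementation):
def generate_ngrams(tokens, n):
    # Space-join each window of n consecutive tokens.
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
# Example:
#   generate_ngrams("experience with machine learning".split(), 2)
#   # -> ['experience with', 'with machine', 'machine learning']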