def extract(self):
    # Parse the raw HTML and keep a cleaned plain-text copy as a fallback.
    self.doc = BeautifulSoup(self.html, "html.parser")
    self.text = clean_text(self.html)
    logging.info("Extract")
    if self.doc is not None:
        try:
            # Pull the title and body text, then collect links and metadata.
            self.title = (self.doc.find("title").get_text()).encode('utf-8')
            self.text = (self.doc.find("body").get_text()).encode('utf-8')
            self.links, self.domains, self.domains_ids = self.fetch_links()
            self.get_meta()
            logging.info("Extracted!")
            return True
        except Exception as ex:
            logging.info(ex)
            self.status = False
            self.msg = str(ex)
            self.code = 700
            return False
    else:
        logging.info("Error in loading html")
        self.status = False
        self.msg = "No html loaded"
        self.code = 700
        return False
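# extract() above assumes a fetch_links() method that is not shown. A minimal
# sketch of what such a method might look like on the same class, assuming
# links are plain <a href> values and domains_ids is simply an index per
# unique domain (both are assumptions, not the project's actual code):
from urllib.parse import urlparse

def fetch_links(self):
    # All hrefs found in the parsed document, absolute or relative.
    links = [a.get("href") for a in self.doc.find_all("a", href=True)]
    # Network locations of the absolute links only.
    domains = [urlparse(link).netloc for link in links
               if link.startswith(("http://", "https://"))]
    # Stable integer id per unique domain.
    domains_ids = {domain: idx for idx, domain in enumerate(sorted(set(domains)))}
    return links, domains, domains_ids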
def search_documents():
    # Read the search query from the JSON request body and normalise it.
    query = request.get_json()["query"]
    terms = clean_text(query)
    print(query)
    res = {
        "docs": [Document(filename).__dict__ for filename in get_filenames()],
        "query": {
            "input": query,
            "term_freq": []
        },
        "terms": terms
    }
    # Compute term frequencies for every document plus the query itself;
    # the last entry in `freq` belongs to the query.
    freq = term_freq([doc["content"] for doc in res["docs"]] + [query])
    for i in range(len(freq)):
        if i == len(freq) - 1:
            res["query"]["term_freq"] = freq[i]
        else:
            res["docs"][i]["term_freq"] = freq[i]
    # Rank documents by similarity between query and document vectors.
    for doc in res["docs"]:
        doc["similarity"] = sim(dict_to_vector(res["query"]["term_freq"]),
                                dict_to_vector(doc["term_freq"]))
    res["docs"] = sorted(res["docs"], key=lambda k: k["similarity"], reverse=True)
    return jsonify(res)
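# search_documents() relies on dict_to_vector() and sim(), which are not shown.
# A minimal sketch under the assumption that term frequencies are dicts of
# token -> count and that sim() is cosine similarity; the names match the
# route above, but these bodies are guesses, not the project's implementation:
import math

def dict_to_vector(term_freq, vocabulary=None):
    # Project a term-frequency dict onto a sorted vocabulary. For the
    # similarity to be meaningful, the query and document vectors should be
    # built over the same vocabulary.
    vocabulary = vocabulary or sorted(term_freq)
    return [term_freq.get(term, 0) for term in vocabulary]

def sim(vec_a, vec_b):
    # Cosine similarity; returns 0.0 when either vector is all zeros.
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    norm = math.sqrt(sum(a * a for a in vec_a)) * math.sqrt(sum(b * b for b in vec_b))
    return dot / norm if norm else 0.0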
def clean_other(text):
    text_ = text.strip().replace("\\", " ")  # .split("@")
    text = clean_text(text_, ["transliteration_cleaners"])
    text = expand_abbreviations(text)
    text = del_space(text)
    text = re.sub('^ ', '', text)
    text = re.sub(' $', '', text)
    return text
def __init__(self, filename):
    self.filename = filename
    # Title is the filename without its extension.
    self.title = (filename.split("."))[0]
    self.content = file_to_string(
        f".\\{DOCUMENT_DIRECTORY}\\{self.filename}")
    self.term_freq = {}
    self.similarity = 0
    # Length of the cleaned document text.
    self.length = len(clean_text(self.content))
    self.first_sentence = get_first_sentence(self.content)
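# The Document constructor assumes file_to_string() and get_first_sentence()
# helpers that are not part of this snippet. A minimal sketch of plausible
# implementations (names kept, bodies assumed):
def file_to_string(path):
    # Read the whole document into one string.
    with open(path, encoding="utf-8") as handle:
        return handle.read()

def get_first_sentence(content):
    # Naive split: everything up to and including the first period.
    head = content.split(".")[0].strip()
    return head + "." if head else ""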
def clean_en(text):
    # English-specific cleaning: run the cleaner pipeline, expand
    # abbreviations, and spell out common symbols.
    text_ = text.strip().replace("\\", " ")  # .split("@")
    text = clean_text(text_, ["english_cleaners"])
    text = expand_abbreviations(text)
    text = re.sub("&", " and ", text)
    text = re.sub("%", " percent ", text)
    text = re.sub(r"\$", " dollar ", text)
    text = del_space(text)
    text = re.sub('^ ', '', text)
    text = re.sub(' $', '', text)
    return text
def get_passenger():
    # Columns returned to the client: the passenger's name, the predicted
    # survival chance, and the model's explanatory variables.
    serializer_fields = [
        'Name',
        'SurvivalChance',
    ] + explanatory_vars
    # Match the requested (cleaned) name against the known passengers.
    closest_passenger = find_closest_passenger(
        q_name=clean_text(request.args.get('name', '')))
    annotate_passenger_with_survival_prediction(closest_passenger)
    return app.response_class(
        response=closest_passenger[serializer_fields].to_json(),
        status=200,
        mimetype='application/json')
def testCleanText(self):
    for tex, result in self.knownCleanValues:
        self.assertEqual(text.clean_text(tex), result)
        (i % 1000) / 100) else ""
    juh = numbers[int((i % 100) / 10)].replace("いち", "") + " じゅう" if int(
        (i % 100) / 10) else ""
    ich = numbers[int(i % 10)] if int(i % 10) > 0 else ""
    return (man + " " + sen + " " + hyk + " " + juh + " " + ich)


#ja_out=codecs.open("train.ja","w",encoding='utf8')
#hi_out=codecs.open("train.hi","w",encoding='utf8')
for i in codecs.open(args.txt, "r", encoding='utf8'):
    # Each input line is "<key>@<text>".
    key, text = i.strip().split("@")
    # Clean the text and normalise punctuation before character-level output.
    text = clean_text(text, ["english_cleaners"]).replace("-", " ").replace(
        "?", " ?").replace("!", " !").replace(":", " ").replace(
        ";", " ").replace("  ", " ").replace(",", "")  # transliteration_cleaners,
    text = expand_abbreviations(text).replace(".", "")
    text = text.replace(" ", "_")
    # Emit the key and the space-separated character sequence.
    print(key + "@" + " ".join(list(text)))
def test_clean_text():
    # Cleaning lowercases the name and drops punctuation, parentheses,
    # and honorifics/suffixes such as "Mrs." and "Jr".
    input_text = "Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)"
    assert clean_text(input_text) == "potter thomas lily alexenia wilson"
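# For illustration, a minimal clean_text that satisfies the assertion above
# (lowercase, strip non-letters, drop honorifics); the real implementation
# may differ, and the _TITLES set here is an assumption:
import re

_TITLES = {"mr", "mrs", "ms", "miss", "dr", "rev", "jr", "sr"}

def clean_text(text):
    # Lowercase and replace anything that is not a letter or whitespace.
    text = re.sub(r"[^a-z\s]", " ", text.lower())
    # Drop honorifics and suffixes, keep everything else.
    return " ".join(tok for tok in text.split() if tok not in _TITLES)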
print('Loading ' + args.train_file)

# Importing the raw data
train = pd.read_csv(args.train_file, usecols=['text', 'event'])
num_train = train.shape[0]

# Assign a shuffled synthetic record id to every row.
ids = np.array(
    [''.join(['record', str(num)]) for num in list(range(num_train))])
np.random.shuffle(ids)
train['id'] = ids[0:num_train]

print('cleaning text')
train['text'] = train['text'].apply(lambda t: preprocess(t))
train.text = pd.Series(clean_text(train.text))

print('clipping')
# Clip every document to clip_to tokens (here the maximum observed length).
train_lengths = np.array([len(doc.split()) for doc in train.text])
clip_to = np.max(train_lengths)
train.text = pd.Series(
    [' '.join(doc.split()[:clip_to]) for doc in train.text])

# Making a lookup dictionary for the event codes
code_df = pd.read_csv(
    os.path.join(args.data_dir, 'code_descriptions.csv'))
codes = code_df.event.values
print(codes)
code_dict = dict(zip(codes, np.arange(len(codes))))
print(code_dict)
train.event = [code_dict[code] for code in train.event]