Example #1
    def extract(self):
        # Parse the stored HTML and keep a cleaned plain-text copy of it.
        self.doc = BeautifulSoup(self.html, "html.parser")
        self.text = clean_text(self.html)
        logging.info("Extract")
        if self.doc is not None:
            try:
                self.title = self.doc.find("title").get_text().encode('utf-8')
                self.text = self.doc.find("body").get_text().encode('utf-8')
                self.links, self.domains, self.domains_ids = self.fetch_links()
                self.get_meta()
                logging.info("Extracted!")
                return True
            except Exception as ex:
                logging.info(ex)
                self.status = False
                self.msg = str(ex)
                self.code = 700
                return False
        else:
            logging.info("Error in loading html")
            self.status = False
            self.msg = "No html loaded"
            self.code = 700
            return False
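
None of the excerpts on this page show clean_text itself; as a point of reference, here is a minimal sketch of what an HTML-oriented variant could look like for the call above, assuming it only strips markup and collapses whitespace (an illustration, not this project's implementation):

# Hypothetical clean_text for raw HTML; the real implementation is not shown
# in the excerpt above.
import re
from bs4 import BeautifulSoup

def clean_text(html):
    # Drop the markup, then collapse whitespace runs into single spaces.
    plain = BeautifulSoup(html, "html.parser").get_text(" ")
    return re.sub(r"\s+", " ", plain).strip()
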
Example #2
def search_documents():

    query = request.get_json()["query"]
    terms = clean_text(query)
    print(query)

    res = {
        "docs": [Document(filename).__dict__ for filename in get_filenames()],
        "query": {
            "input": query,
            "term_freq": []
        },
        "terms": terms
    }

    freq = term_freq([doc["content"] for doc in res["docs"]] + [query])

    for i in range(len(freq)):
        if i == len(freq) - 1:
            res["query"]["term_freq"] = freq[i]
        else:
            res["docs"][i]["term_freq"] = freq[i]

    for doc in res["docs"]:
        doc["similarity"] = sim(dict_to_vector(res["query"]["term_freq"]),
                                dict_to_vector(doc["term_freq"]))

    res["docs"] = sorted(res["docs"],
                         key=lambda k: k["similarity"],
                         reverse=True)

    return jsonify(res)
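
The helpers term_freq, dict_to_vector and sim are not part of the excerpt above; a minimal sketch of the last two, assuming every term-frequency dict covers the same vocabulary and that sim is cosine similarity (both assumptions are read off the call sites, not taken from the project):

# Hypothetical sketches of dict_to_vector and sim; only their call sites above
# are known, the bodies are assumptions.
import math

def dict_to_vector(term_freq):
    # Assumes every dict was built over the same vocabulary, so sorting the
    # keys yields aligned vectors.
    return [term_freq[term] for term in sorted(term_freq)]

def sim(a, b):
    # Cosine similarity between two equal-length count vectors.
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0
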
Example #3
def clean_other(text):
    text_ = text.strip().replace("\\", " ")  #.split("@")
    text = clean_text(text_, ["transliteration_cleaners"])
    text = expand_abbreviations(text)
    text = del_space(text)
    text = re.sub('^ ', '', text)
    text = re.sub(' $', '', text)
    return text
Example #4
    def __init__(self, filename):
        self.filename = filename
        self.title = filename.split(".")[0]
        self.content = file_to_string(
            f".\\{DOCUMENT_DIRECTORY}\\{self.filename}")
        self.term_freq = {}
        self.similarity = 0
        self.length = len(clean_text(self.content))
        self.first_sentence = get_first_sentence(self.content)
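
file_to_string and get_first_sentence come from the same project and are not shown; plausible minimal versions, offered only as a sketch of what the constructor above expects:

# Hypothetical helpers assumed by Document.__init__; not the project's code.
def file_to_string(path):
    # Read the whole document into a single string.
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()

def get_first_sentence(content):
    # Naive first-sentence extraction: everything up to the first period.
    return content.split(".", 1)[0].strip() + "."
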
Example #5
def clean_en(text):
    text_ = text.strip().replace("\\", " ")  #.split("@")
    text = clean_text(text_, ["english_cleaners"])
    text = expand_abbreviations(text)
    text = re.sub("&", " and ", text)
    text = re.sub("%", " percent ", text)
    text = re.sub(r"\$", " dollar ", text)
    text = del_space(text)
    text = re.sub('^ ', '', text)
    text = re.sub(' $', '', text)
    return text
Example #6
def get_passenger():

    serializer_fields = [
        'Name',
        'SurvivalChance',
    ] + explanatory_vars

    closest_passenger = find_closest_passenger(
        q_name=clean_text(request.args.get('name', '')))
    annotate_passenger_with_survival_prediction(closest_passenger)

    return app.response_class(
        response=closest_passenger[serializer_fields].to_json(),
        status=200,
        mimetype='application/json')
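
find_closest_passenger is not shown in the excerpt; below is a sketch of how a fuzzy name lookup over a pandas DataFrame could work, with the PASSENGERS table, the 'CleanName' column and the matching strategy all assumed for illustration:

# Hypothetical lookup behind the endpoint above; the table, the column names
# and the matching strategy are assumptions.
import difflib
import pandas as pd

PASSENGERS = pd.DataFrame({
    'Name': ['Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)'],
    'CleanName': ['potter thomas lily alexenia wilson'],
})

def find_closest_passenger(q_name):
    # Fuzzy-match the cleaned query against the cleaned passenger names and
    # return the best-matching row as a Series.
    names = PASSENGERS['CleanName'].tolist()
    best = difflib.get_close_matches(q_name, names, n=1, cutoff=0.0)
    return PASSENGERS[PASSENGERS['CleanName'] == best[0]].iloc[0] if best else None
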
Example #8
    def testCleanText(self):
        for tex, result in self.knownCleanValues:
            self.assertEqual(text.clean_text(tex), result)
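
The knownCleanValues fixture driving this test is not included in the excerpt; a purely illustrative example of the (input, expected) pairs such a table-driven test usually holds, where the expected strings are assumptions rather than values from the project:

# Hypothetical fixture; the pairs are illustrative, not the project's data.
knownCleanValues = [
    ("  Hello,   World! ", "hello world"),
    ("already clean", "already clean"),
]
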
Example #9
        hyk = numbers[int((i % 1000) / 100)].replace("いち", "") + " ひゃく" if int(
            (i % 1000) / 100) else ""
        juh = numbers[int((i % 100) / 10)].replace("いち", "") + " じゅう" if int(
            (i % 100) / 10) else ""
        ich = numbers[int(i % 10)] if int(i % 10) > 0 else ""
        return (man + " " + sen + " " + hyk + " " + juh + " " + ich)


# ja_out = codecs.open("train.ja", "w", encoding='utf8')
# hi_out = codecs.open("train.hi", "w", encoding='utf8')

for i in codecs.open(args.txt, "r", encoding='utf8'):
    key, text = i.strip().split("@")
    # Normalise the transcript, then space-separate punctuation and drop commas.
    text = clean_text(text, ["english_cleaners"])
    text = (text.replace("-", " ").replace("?", " ?").replace("!", " !")
                .replace(":", " ").replace(";", " ").replace("  ", " ")
                .replace(",", ""))
    text = expand_abbreviations(text).replace(".", "")
    # Emit one "key@c h a r a c t e r s" line per input, with spaces kept as "_".
    text = text.replace(" ", "_")
    print(key + "@" + " ".join(list(text)))
Example #10
def test_clean_text():
    input_text = "Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)"
    assert clean_text(input_text) == "potter thomas lily alexenia wilson"
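
This test pins the behaviour down fairly tightly: lowercase, strip punctuation, and drop titles such as "Mrs." and "Jr". A sketch that would satisfy this particular assertion, assuming a small stop-list of honorifics (the real clean_text may do more):

# Hypothetical clean_text satisfying the assertion above; the stop-list is an
# assumption made only for illustration.
import re

HONORIFICS = {"mr", "mrs", "miss", "ms", "jr", "sr", "dr", "rev"}

def clean_text(name):
    tokens = re.sub(r"[^a-z\s]", " ", name.lower()).split()
    return " ".join(token for token in tokens if token not in HONORIFICS)
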
Example #11
        print('Loading ' + args.train_file)
        # Importing the raw data
        train = pd.read_csv(args.train_file, usecols=['text', 'event'])

        num_train = train.shape[0]

        ids = np.array(
            [''.join(['record', str(num)]) for num in list(range(num_train))])

        np.random.shuffle(ids)
        train['id'] = ids[0:num_train]

        print('cleaning text')
        train['text'] = train['text'].apply(lambda t: preprocess(t))

        train.text = pd.Series(clean_text(train.text))

        print('clipping')
        train_lengths = np.array([len(doc.split()) for doc in train.text])
        clip_to = np.max(train_lengths)
        train.text = pd.Series(
            [' '.join(doc.split()[:clip_to]) for doc in train.text])

        # Making a lookup dictionary for the event codes
        code_df = pd.read_csv(
            os.path.join(args.data_dir, 'code_descriptions.csv'))
        codes = code_df.event.values
        print(codes)
        code_dict = dict(zip(codes, np.arange(len(codes))))
        print(code_dict)
        train.event = [code_dict[code] for code in train.event]