Example #1
import re
from datetime import date, timedelta

from date_extractor import extract_dates


def get_date(clean_words, question):
    date1 = ''
    date2 = ''
    if "yesterday" in clean_words:
        date1 = date.today() - timedelta(1)
        date1 = date1.strftime("%Y-%m-%d")
    elif "today" in clean_words:
        date1 = date.today().strftime("%Y-%m-%d")

    if date1 == '':
        question = re.sub(r"(?<=\d)(st|nd|rd|th)\b", '', question)
        dates = extract_dates(question)
        try:
            date1 = dates[0].strftime("%Y-%m-%d")
            date2 = dates[1].strftime("%Y-%m-%d")
        except IndexError:
            # fewer than two dates found; keep whatever was parsed
            pass

    date_req = []
    if date1 == '':
        return ("Unable to identify the required day for query")
    else:
        date_req.append(date1)
        date_req.append(date2)
        return date_req
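A quick usage sketch for the snippet above; the arguments are made up and the first output depends on the current date:

print(get_date(["what", "was", "revenue", "yesterday"],
               "what was revenue yesterday"))
# expected: [<yesterday as 'YYYY-MM-DD'>, ''] -- the shortcut branch fills date1 only

print(get_date([], "between March 3rd and March 5th 2021"))
# expected: ['2021-03-03', '2021-03-05'], via extract_dates after stripping ordinals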
Example #2
from date_extractor import extract_dates


def date_extract(text):
    # alternatives considered: datefinder.find_dates(text), dparser.parse(text, fuzzy=True)
    dates = extract_dates(text, return_precision=True)

    return dates
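With return_precision=True, each match appears to come back as a (datetime, precision) pair rather than a bare datetime (Example #8 below filters on o[1] == 'day'). A minimal sketch under that assumption:

# precision labels such as 'day' are an assumption taken from Example #8
for dt, precision in date_extract("The meeting is on 5 March 2021."):
    if precision == 'day':
        print(dt.strftime("%Y-%m-%d"))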
Example #3
def extract_date(text):
    dates = extract_dates(text)
    for d in dates:
        if d is not None:
            year = int(str(d).split('-')[0])  # leading YYYY of the ISO string
            if year in [2016, 2017, 2018, 2019, 2020]:
                print('Date : ', str(d)[:10])  # keep only the YYYY-MM-DD part
                return str(d)[:10]
Example #4
def extract_DOB(text):
    Dob = ""
    rawdata = ""
    textSplit = text.splitlines()
    count = len(textSplit)
    # common spellings of "date of birth"; dots escaped, duplicate alternative removed
    pattern = re.compile(
        r'Date of Birth|Date of birth|date of birth|DATE OF BIRTH|dateofbirth|DOB|dob|D\.O\.B|d\.o\.b')
    i = 0
    for data in textSplit:
        temp = pattern.findall(data)
        i = i + 1
        if len(temp) != 0:
            rawdata = data
            if i < count:
                # pull in the following line(s), since the date usually follows the
                # label (joined with a space; the original concatenated without one)
                rawdata = rawdata + " " + textSplit[i]
                final = count - i
                if final >= 2:
                    rawdata = rawdata + " " + textSplit[i + 1]
                    rawdata = rawdata.replace("?", "")
                    rawdata = " ".join(rawdata.split())
                    break
    if rawdata != "":
        matches = datefinder.find_dates(rawdata)
        for match in matches:
            # zero-pad to DD.MM.YYYY
            Dob = "{:02d}.{:02d}.{}".format(match.day, match.month, match.year)
            break

        if Dob == "":
            # fall back to date-extractor when datefinder finds nothing
            for match in extract_dates(rawdata):
                Dob = "{}-{}-{}".format(match.day, match.month, match.year)
                break
        return Dob
    else:
        return ""
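A minimal sketch of calling extract_DOB; the sample text, and the expectation that datefinder parses it, are assumptions:

sample = "Patient: Jane Doe\nDate of Birth\n12 March 1990"
print(extract_DOB(sample))  # expected '12.03.1990' via the datefinder branch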
Example #5
def get_dates(self):
    # https://spacy.io/usage/linguistic-features#101
    ents = [ent for ent in self.doc.ents if ent.label_ == 'DATE']
    dates = list()
    for ent in ents:
        date = dateparser.parse(ent.text)
        if date is not None:  # dateparser returns None when parsing fails
            dates.append(date)
    if not dates:
        # fall back to date-extractor when nothing was parsed from the entities
        dates += extract_dates(self.doc.text)
    return dates
Example #6
def extract_dol(lines):
    '''Take a list of text lines and extract
    the dates found in each line, if any.
    '''
    dates_arr = []
    for line in lines:
        dates = date_extractor.extract_dates(line)
        for date in dates:
            dates_arr.append(date.date())

    return dates_arr
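A usage sketch for extract_dol; the import and sample lines are assumptions:

import date_extractor

lines = ["Invoice issued on 15 March 2020.", "no date on this line"]
print(extract_dol(lines))  # expected: [datetime.date(2020, 3, 15)]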
Example #7
def get_dates(url):
    try:
        date_str = ''
        html_doc = requests.get(url).text
        text_doc = BeautifulSoup(html_doc, 'html.parser').text
        dates = extract_dates(text_doc)
        for d in dates:
            if isinstance(d, datetime.datetime):
                date_str += d.strftime('%Y-%m-%d') + '\n'
    except RequestException as error:
        date_str = 'Request error: {}'.format(error)
        print('Request error: {}'.format(error))
    except Exception as error:  # narrower than BaseException; lets KeyboardInterrupt through
        date_str = 'error: {}'.format(error)
        print('error: {}'.format(error))
    return date_str
Example #8
def ParseDateStr(dstr):
    if len(dstr) == 0:
        return ''
    dstr = ' ' + dstr.lower() + ' '
    # strip ordinal suffixes (1st, 2nd, 3rd, 4th); resub is presumably an alias for re.sub
    dstr = resub(r'[\W\d](th|st|nd|rd)\W', '', dstr).strip()

    try:
        dobjs = extract_dates(dstr, return_precision=True, debug=False)
        for o in dobjs:
            if o[1] == 'day':  # only accept day-precision matches
                dobj = o[0].replace(tzinfo=None)
                if dobj <= cdate:  # cdate: cutoff date defined elsewhere
                    return dobj
    except Exception:
        return ''

    return ''
Example #9
def dateExtractor(filename):
    """
    This function handles extracting dates from text.
    """
    try:
        text = textExtraction(filename)
        dates = extract_dates(text)
        # drop failed matches and format the rest; the original reassigned the
        # loop variable, which never modified the list itself
        dates = [d.strftime("%Y-%m-%d") for d in dates if d is not None]
        if len(dates):
            return {'date': dates}
        else:
            return {'date': 'null'}
    except Exception:
        return {"error": "Some error occurred"}
Example #10
def DATETIME_to_iso(datetime_string):
    formatted_dates = []
    matches = extract_dates(datetime_string)
    for match in matches:
        if match is None:
            break
        formatted_dates.append(match.isoformat())

    if len(formatted_dates) == 0:
        # fall back to parsedatetime for ranges like "X and Y" or "X to Y"
        cal = parsedatetime.Calendar()
        dates = datetime_string.split(" and ")
        if len(dates) == 1:
            dates = dates[0].split(" to ")
        for date_string in dates:
            time_struct, parse_status = cal.parse(date_string)
            date = datetime(*time_struct[:6])
            formatted_dates.append(date.isoformat())

    return '/'.join(formatted_dates)
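A hedged usage sketch, assuming the module's imports are in place; the exact output depends on what extract_dates returns for the range:

print(DATETIME_to_iso("March 3, 2020 to March 5, 2020"))
# expected something like '2020-03-03T00:00:00/2020-03-05T00:00:00'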
Example #11
def date_extract(ls):
    '''
    Function to extract dates from a list of strings
    :param ls: list of strings
    :return: dates extracted via two sources (spaCy NER and date-extractor)
    '''
    date_ner = []
    extractor_dates = []  # renamed so it does not shadow the date_extractor module
    for i in ls:
        if "issue date" in i:
            print(i)
        dates = extract_dates(i)
        if len(dates) != 0:
            print("Date Extractor:", str(dates[0].date()))
            extractor_dates.append(dates[0])
        doc = nlp(i)
        for j in doc.ents:
            if j.label_ == "DATE":
                print("Issue date: ", j)
                date_ner.append(j)

    return date_ner, extractor_dates
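A usage sketch for date_extract; the spaCy model name is an assumption, and nlp must be a module-level global before the function runs:

import spacy

nlp = spacy.load("en_core_web_sm")  # hypothetical model choice

ner_dates, extractor_dates = date_extract(["issue date: 1 July 2019"])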
Example #12
def analyze(request):

    puncts = string.punctuation
    word_to_find = request.POST.get("word_input")
    # "replace_input" is an assumed form-field name; the original used the
    # variable in the "replace" branch without ever reading it from the request
    replace_input = request.POST.get("replace_input", "")
    djText = request.POST.get('text', 'default')
    # every flag below reads the same 'option' field; the second argument is only
    # a fallback when 'option' is missing, so at most one branch can match
    remPunc = request.POST.get('option', 'removepunc')
    cap = request.POST.get('option', 'capitalize')
    small = request.POST.get('option', 'toSmall')
    upper = request.POST.get('option', 'toUpper')
    word_find_flag = request.POST.get('option', 'word_find')
    New_Line = request.POST.get('option', 'New_line')
    Emails = request.POST.get('option', 'Email_Address')
    Links = request.POST.get('option', 'Links')
    Passgen = request.POST.get('option', 'Password_Generator')
    search_word = request.POST.get('option', 'Search_word')
    gallery = request.POST.get('option', 'q')
    Suggest_word = request.POST.get('option', 'suggest_word')
    Sen_Analysis = request.POST.get('option', 'Sentiment')
    Grammar = request.POST.get('option', 'grammar')
    Channel = request.POST.get('option', 'suggest_youtube')
    books = request.POST.get('option', 'suggest_books')
    articles = request.POST.get('option', 'suggest_articles')
    lemmitizer = request.POST.get('option', 'lemmitize')
    start_pdf = request.POST.get('option', 'generate_pdf')
    replace_text = request.POST.get('option', 'replace')
    Word_cloud = request.POST.get('option', 'wordcloud')
    Date = request.POST.get('option', 'date')
    Word_frequency = request.POST.get('option', 'word_frequency')

    analyzed_text = ""
    word_status = ""

    countword = len(djText.split())

    if word_find_flag == "word_find":
        if word_to_find != "":
            if djText.find(word_to_find) != -1:
                word_status = "found"
                word = djText.replace(
                    word_to_find,
                    '<b style="color:red;">' + word_to_find + "</b>")
                djText = word

                try:
                    synonym_01 = get_synonyms(word_to_find)
                    synonyms2 = random.sample(synonym_01, 4)

                    final = ""
                    for f in synonyms2:
                        final += f + " , "

                    example = get_example(word_to_find)

                    synonyms = final + example

                except Exception:
                    synonyms = "Not Available"

            else:
                word_status = "not found"
                synonyms = "Text Not Found"

            analyzed_text = djText
            word_find = "Find Word = " + word_to_find
            synonym = format_html('<b style="color:{};">{}</b>', 'green',
                                  synonyms)

            result = {
                "analyzed_text": analyzed_text,
                "highlight":
                "Chosen word is highlighted in red colour and synonyms/examples in green colour",
                "purpose": word_find,
                "status": word_status,
                "synonym": synonym,
                "wordcount": countword,
                "analyze_text": True,
                "findWord": True
            }

    elif New_Line == "New_line":
        for char in djText:
            if char == '.':
                char = '\n'
            analyzed_text = analyzed_text + char
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Changes '.' to New Line",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Emails == "Email_Address":
        regex = r'^[a-z0-9]+[._]?[a-z0-9]+@\w+\.\w{2,3}$'
        # candidate tokens containing '@' (the original '@+' also matched repeated '@')
        lst = re.findall(r'\S+@\S+', djText)
        tmp = ""
        for x in lst:
            if (re.search(regex, x)):
                tmp += x
                tmp += '\n'
        result = {
            "analyzed_text": tmp,
            "purpose": "Find All Emails",
            "analyze_text": True,
            "wordcount": countword
        }

    elif Passgen == "Password_Generator":
        stop_words = set(stopwords.words('english'))
        chars = "!£$%&*#@"
        ucase_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        text = re.sub(r'[^\w\s]', '', djText)
        token = word_tokenize(text)

        filtered_sentence = []

        for w in token:
            if w not in stop_words:
                filtered_sentence.append(w)

        if len(filtered_sentence) > 0:
            random_word = random.choice(filtered_sentence)
        else:
            random_word = token[0]

        random_word = random_word.title()

        merge = ""
        for word in random_word.split():
            merge+=random.choice(chars)+word[:-1]+ word[-1].upper()\
            +random.choice(string.ascii_letters)+"@"+random.choice(ucase_letters)\
            +random.choice(string.digits)+" "
        final_text = merge[:-1]
        result = {
            "analyzed_text": final_text,
            "purpose": "Generate password from text",
            "generate_text": True,
            "wordcount": countword
        }

    elif search_word == "Search_word":
        url = 'https://www.dictionary.com/browse/'
        headers = requests.utils.default_headers()
        headers.update({
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        })
        req = requests.get(url + djText, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        mydivs = soup.find_all("div", {"value": "1"})[0]
        meaning = ""  # initialized in case the div has no children
        for tags in mydivs:
            meaning = tags.text  # keeps the text of the last child tag
        wrap = textwrap.TextWrapper(width=100)
        word_meaning = wrap.fill(text=meaning)
        result = {
            "analyzed_text": word_meaning,
            "purpose": "Searched Word",
            "generate_text": True,
            "wordcount": countword
        }

    elif Suggest_word == "suggest_word":
        find = requests.get(
            f"https://www.dictionaryapi.com/api/v3/references/thesaurus/json/{djText}?key={api_key}"
        )
        response = find.json()

        if len(response) == 0:
            suggest = "Word Not Recognized!"  # keeps 'suggest' defined for the result dict
            print(suggest)
        else:
            k = []
            if str(response[0]).count(" ") == 0:
                for j in range(len(response)):
                    k.append(response[j])
                predict = " , ".join(k)
                djText = predict

            else:
                dictionary = PyDictionary()
                testdict = dictionary.synonym(djText)
                suggest = " , ".join(testdict)
                djText = suggest
            wrap = textwrap.TextWrapper(width=100)
            suggest = wrap.fill(text=djText)

        result = {
            "analyzed_text": suggest,
            "purpose": "Suggested Word",
            "generate_text": True,
            "wordcount": countword
        }

    elif Sen_Analysis == "Sentiment":

        djText = ' '.join(
            re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ",
                   djText).split())

        analysis = TextBlob(djText)
        # set sentiment
        if analysis.sentiment.polarity > 0:
            final = str(djText) + " (Positive Text)"
        elif analysis.sentiment.polarity == 0:
            final = str(djText) + " (Neutral Text)"
        else:
            final = str(djText) + " (Negative Text)"

        result = {
            "analyzed_text": final,
            "purpose": "Sentiment Analysis",
            "analyze_text": True,
            "wordcount": countword
        }

    elif Grammar == "grammar":
        parser = GingerIt()
        result = parser.parse(djText)
        final = result["result"]

        if final == '':
            final = "Please write some text to check grammar"

        result = {
            "analyzed_text": final,
            "grammar": djText,
            "purpose": "Spelling & Grammar Check",
            "analyze_text": True,
            "wordcount": countword
        }

    elif lemmitizer == "lemmitize":
        wordnet_lemmatizer = WordNetLemmatizer()
        tokenization = nltk.word_tokenize(djText)
        result = ""  # stays empty when no word changes under lemmatization
        count = True
        for w in tokenization:
            k = wordnet_lemmatizer.lemmatize(w, pos="v")
            if w != k:
                result = "{} -> {}".format(w, k)  # only the last changed pair is kept
                count = False
        if count:
            final = "No need for lemmatization"
        else:
            final = "(Original word) -> (Lemmatized word)"

        result = {
            "analyzed_text": result,
            "highlight": final,
            "purpose": "Lemmatization of text",
            "analyze_text": True,
            "wordcount": countword
        }

    elif Channel == "suggest_youtube":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Suggest youtube channels",
            "status": "Press Button To View Channel links",
            "find_channel": True,
            "generate_text": True,
            "wordcount": countword
        }

    elif books == "suggest_books":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Search Books",
            "status": "Press Button To View Books",
            "find_books": True,
            "generate_text": True,
            "wordcount": countword
        }

    elif articles == "suggest_articles":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Search Articles",
            "status": "Press Button To View Articles",
            "find_articles": True,
            "generate_text": True,
            "wordcount": countword
        }

    elif start_pdf == "generate_pdf":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": "Check Your Pdf",
            "purpose": "Generate Pdf",
            "status": "Press Button To View Pdf",
            "make_pdf": True,
            "generate_text": True,
            "wordcount": countword
        }

    elif replace_text == "replace":
        final_text = re.sub(word_to_find, replace_input, djText)
        result = {
            "analyzed_text": final_text,
            "purpose": "Replacement of text in sentence",
            "analyze_text": True,
            "wordcount": countword
        }

    elif Word_cloud == "wordcloud":
        cloud = WordCloud(background_color="white",
                          max_words=200,
                          stopwords=set(STOPWORDS))
        wc = cloud.generate(djText)
        buf = io.BytesIO()
        wc.to_image().save(buf, format="png")
        data = base64.b64encode(buf.getbuffer()).decode("utf8")
        final = "data:image/png;base64,{}".format(data)

        result = {
            "analyzed_text": " ",
            "purpose": "Wordcloud",
            "my_wordcloud": final,
            "generate_text": True,
            "wordcount": countword
        }

    elif Date == "date":
        final = extract_dates(djText)
        # guard against input without a recognizable date
        final_text = final[0].date() if final else "No date found"

        result = {
            "analyzed_text": final_text,
            "purpose": "Extract Dates from text",
            "analyze_text": True,
            "wordcount": countword
        }

    elif Word_frequency == "word_frequency":
        input_text = djText.replace("\n", " ")
        djText = input_text.lower()

        words_dict = get_words_dict(djText)
        # create graph
        if len(words_dict) > 10:
            k = 10
        else:
            k = len(words_dict)

        y_pos = range(0, k)
        bars = []
        height = []
        count = 0

        # print and save values to graph
        format_spaces("word", "occurrences")
        for word_str, word_amount in words_dict.items():
            format_spaces(word_str, word_amount)
            count += 1
            if count <= 10:
                bars.append(word_str)
                height.append(int(word_amount))
            else:
                pass

        # Create bars
        plt.bar(y_pos, height)

        # Create names on the x-axis
        plt.xticks(y_pos, bars, size=9)

        plt.xticks(rotation='horizontal')
        plt.ylabel('Word Frequency', fontsize=12, labelpad=10)
        plt.xlabel('Words', fontsize=12, labelpad=10)

        fig = plt.gcf()

        buf = BytesIO()
        fig.savefig(buf, format='png')
        buf.seek(0)
        data = base64.b64encode(buf.read())
        uri = urllib.parse.quote(data)
        final = "data:image/png;base64,{}".format(uri)

        result = {
            "analyzed_text": " ",
            "purpose": "Word Frequency for every word in text",
            "bar_graph": final,
            "analyze_text": True,
            "wordcount": countword
        }

    elif gallery == "q":
        request.session['user-input'] = djText
        result = {
            "analyzed_text": djText,
            "purpose": "Images",
            "status": "Press Button To View Images",
            "find_image": True,
            "generate_text": True,
            "wordcount": countword
        }

    elif remPunc == 'removepunc':
        for char in djText:
            if char not in puncts:
                analyzed_text = analyzed_text + char
        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Remove Punctuations",
            "analyze_text": True,
            "wordcount": countword
        }
    elif cap == "capitalize":
        analyzed_text = djText.capitalize()

        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Capitalize",
            "analyze_text": True,
            "wordcount": countword
        }

    elif small == "toSmall":
        analyzed_text = djText.lower()

        result = {
            "analyzed_text": analyzed_text,
            "purpose": "To Smallercase",
            "analyze_text": True,
            "wordcount": countword
        }

    elif upper == "toUpper":
        analyzed_text = djText.upper()

        result = {
            "analyzed_text": analyzed_text,
            "purpose": "To Uppercase",
            "analyze_text": True,
            "wordcount": countword
        }
    elif Links == "Links":
        pattern = r'(?:(?:https?|ftp|file)://|www\.|ftp\.)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])'
        links = re.findall(pattern, djText, re.IGNORECASE)
        analyzed_text = ""

        i = 0
        for x in links:
            i = i + 1
            analyzed_text += f'<a href="{x}" target="_blank">Link {i}</a>'
            analyzed_text += '\n '

        result = {
            "analyzed_text": analyzed_text,
            "purpose": "Find All Links",
            "analyze_text": True,
            "wordcount": countword
        }

    else:
        return HttpResponse(
            '''<script type="text/javascript">alert("Please select at least one option.");</script>'''
        )

    return render(request, 'analyze.html', result)
Example #13
from date_extractor import extract_dates
import datetime
from datetime import date
import pytesseract
from PIL import Image
import datefinder
import numpy as np
import re

dtype = np.int64
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# raw strings so the Windows path backslashes are taken literally
text = pytesseract.image_to_string(
    Image.open(
        r"F:\Pycharm\Machine-Learning\Data_Extraction\Dataset\img1.jpeg"))
# print(text)
# text = text.replace(" ", "")
dates = extract_dates(text)
# x = datetime.datetime(dates)
print(dates)
for i in dates:
    print('the extracted date is', i.strftime("%d %b %Y"))
    break

# print(text)
# st = text.split("\n") #  list containg the text of the image
Example #14
def process_sample(index, pdf_path, database_name, table_name, system_username,
		corenlp_ptr, degrees_of_rotation, fp, compiled_DDMMYYYY_date_pattern,
		compiled_YYYYMMDD_date_pattern, compiled_MMDDYYYY_date_pattern, compiled_PHN_pat):
	text = p2t.convert_pdf_to_txt(pdf_path, degrees_of_rotation)
		
	#per_day_num = tuple:(PERSON[], DATE[], NUMBER[])
	per_day_num = interact.annotate_ner_with_corenlp(text.replace(",",""), corenlp_ptr)
	#filep = open("converted_text_{}.txt".format(str(index)),"w")
	#filep.write(text)
	valid_dates = []
	#each of these strip_dates function calls appends each valid date match to the valid_dates list
	strip_dates(per_day_num[1],compiled_DDMMYYYY_date_pattern,valid_dates, DDMMYYYY=True, MMDDYYYY = False, YYYYMMDD = False )
	strip_dates(per_day_num[1],compiled_YYYYMMDD_date_pattern,valid_dates,DDMMYYYY= False, MMDDYYYY = False, YYYYMMDD = True)
	strip_dates(per_day_num[1],compiled_MMDDYYYY_date_pattern,valid_dates, DDMMYYYY= False,MMDDYYYY = True, YYYYMMDD = False)
	find_dates(text,compiled_DDMMYYYY_date_pattern,valid_dates, DDMMYYYY=True, MMDDYYYY = False, YYYYMMDD = False )
	find_dates(text,compiled_YYYYMMDD_date_pattern,valid_dates,DDMMYYYY= False, MMDDYYYY = False, YYYYMMDD = True)
	find_dates(text,compiled_MMDDYYYY_date_pattern,valid_dates, DDMMYYYY= False,MMDDYYYY = True, YYYYMMDD = False)
	found_datetimes = []
	for date in valid_dates:
		try:
			# keep only plausible calendar dates within 1901-2017
			if 0 < int(date[1]) < 13 and 0 < int(date[2]) < 32 and 1900 < int(date[0]) < 2018:
				found_datetimes.append(datetime.date(int(date[0]), int(date[1]), int(date[2])))
		except Exception:
			continue
	# (the original then rebuilt the same list with an unguarded comprehension,
	# silently overwriting the try/except version above; that duplicate is dropped)
	extracted_dates = date_extractor.extract_dates(text)
	extracted_dates = [dt.date() for dt in extracted_dates if dt]
	found_datetimes+= extracted_dates
	"""
	#print("PERSON list :",str(per_day_num[0]))
	#print("CoreNLP's DATE list: ", str(per_day_num[1]))
	#print("NUMBER list: ", str(per_day_num[2]))
	#print("Regular expression's DATES list:", str(valid_dates))
	#print("Datetime.date objects: ", str(found_datetimes))
	#print("VALID PHN list: ", PHN_identifier(per_day_num[2],compiled_PHN_pat))
	#print("PATIENT HYPOTHESIS from highest frequency: " , patient_hypothesis(per_day_num[0]))
	"""
		
	fp.write("{}\nTest case #{} processed: ".format(str(pdf_path),index))
	fp.write("Person List: "+ str(per_day_num[0])+"\n\n")
	fp.write("CoreNLP's Date List: "+ str(per_day_num[1])+"\n\n")
	fp.write("Extracted dates with date-extractor: " + str(extracted_dates)+"\n\n")
	fp.write("Number list: "+ str(per_day_num[2])+"\n\n")
	fp.write("Verified Date List: "+ str(valid_dates)+"\n\n")
	fp.write("Valid PHN List: "+ str(PHN_identifier(per_day_num[2], compiled_PHN_pat))+"\n\n")

	db= db_interaction.make_connection_to_db(database_name, system_username)
	
	#####################################################################################################
	#combining the dates in this step
	
	
	PHN_vs_DOB_vs_partial_name_results =db_interaction.PHN_vs_DOB_vs_partial_name_query(db, PHN_identifier(per_day_num[2],compiled_PHN_pat), found_datetimes,per_day_num[0], table_name)
	PHN_vs_DOB_results = db_interaction.PHN_vs_DOB_query(db, PHN_identifier(per_day_num[2],compiled_PHN_pat), found_datetimes, table_name)
	PHN_vs_partial_name_results = db_interaction.PHN_vs_partial_name_query(db, PHN_identifier(per_day_num[2],compiled_PHN_pat), per_day_num[0], table_name)
	DOB_vs_partial_name_results = db_interaction.DOB_vs_partial_name_query(db, found_datetimes, per_day_num[0], table_name)
	
	#This patient prediction is the variable which should be used to determine where the sample gets filed
	################################################################################################################################################## THERES A NONE HERE TO REPRESENT POSSIBLE BOTTOM UP MATCHES
	patient_prediction_result = patient_hypothesis((PHN_vs_DOB_vs_partial_name_results,PHN_vs_DOB_results,PHN_vs_partial_name_results,DOB_vs_partial_name_results,None))
	
	fp.write("\nPatient Hypothesis: " + str(patient_prediction_result)+" for {}".format(str(pdf_path)))
	fp.write("\nA: Matches crossreferencing the PHN vs DOB vs partial found names\n" + str(PHN_vs_DOB_vs_partial_name_results))
	fp.write("\nB: Matches crossreferencing the PHN vs DOB:\n" + str(PHN_vs_DOB_results))
	fp.write("\nC: Matches crossreferencing the PHN vs partial found names:\n" + str(PHN_vs_partial_name_results))
	fp.write("\nD: Matches crossreferencing the DOB vs partial found names:\n" + str(DOB_vs_partial_name_results))
	fp.write("\n Matches found using only DOB: " + str(db_interaction.DOB_query(db,found_datetimes,table_name)))
	fp.write("\n\n\n TEXT EXTRACTED: " + text)
	fp.close()
	return patient_prediction_result
Example #15
import date_extractor

text = "need to get two signatures."
dates = date_extractor.extract_dates(text)

print(dates)  # the text contains no date, so this prints an empty list
Example #16
    ## Define regex for a CUSIP-like identifier (5 digits, 3 letters, 1 digit)
    regex = "[0-9]{5}[a-z]{3}[0-9]{1}"

    flag = 0
    flag_issue = 0
    date_list = []

    for i in range(pdfReader.numPages):
        pageObj = pdfReader.getPage(i)
        text = pageObj.extractText()

        ls = text.split("\n")
        #print(ls)
        for word in ls:
            if (flag == 0):
                date = extract_dates(word)
                #if(len(date)>0):
                #    print("Doc date: ",date)
                doc = nlp(word)
                for j in doc.ents:
                    if j.label_ == "DATE":
                        print("DOC DATE:", j)
                        date_list.append(j)
                        flag = 1
                        #date_ner.append(j)
                        break

        ls = listtoLower(ls)
        for index in range(len(ls)):
            if ("issue date" in ls[index]):
                #print(ls[index])