Exemplo n.º 1
0
    def test_runs(self):
        """Check that run_multip and run_singlep agree with the expected output.

        Reads the concept list from ./concepts.list and test sentences from
        ./test.list (first column: the sentence; remaining non-empty columns:
        the expected concept labels), then asserts that the multi-process and
        single-process runners both return exactly the expected set.

        NOTE: the original body mixed tab and space indentation, which is a
        TabError under Python 3; indentation is now uniform spaces.
        """
        # Read sentences from a file.
        import csv
        from findCon import run_singlep, run_multip

        # Build the Concept list from the first CSV column of each row.
        conlist = []
        with open('./concepts.list', 'r') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                conlist.append(Concept(row[0]))

        with open('./test.list', 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for row in reader:
                specs = {
                    'sen': Sentence(row[0]),
                    'conlist': conlist,
                }

                y1 = run_multip(specs)
                y2 = run_singlep(specs)
                # Expected labels: the remaining non-empty CSV columns.
                y3 = [r for r in row[1:] if r != '']
                # Both runners must yield exactly the expected concept set.
                case = set(y1) == set(y2) == set(y3)
                self.assertTrue(case)
Exemplo n.º 2
0
def texts_to_sentences(id, texts):
    """
    Converts texts into sentences.

    Walks every character of every text run, accumulating content until a
    sentence boundary ('.') is reached, and records each sentence fragment's
    offset and length relative to its run.

    :param id: unused here; kept for interface compatibility
    :type texts: list of texts
    :return: list of sentences
    """
    sentences = []
    sentence = Sentence()
    distance_from_run_offset = 0
    # Fix: initialize `content` so the final check below does not raise
    # NameError when `texts` is empty.
    content = ''
    for text in texts:
        content = ''
        beginningOfSentenceOffset = text.run_offset
        chars = text.processed_content()
        for char in chars:
            content = content + char
            distance_from_run_offset += 1

            # If content is only bolded or italicized text, it should not be
            # its own sentence.  (Strings are immutable, so the original
            # copy.deepcopy of `content` was unnecessary and has been removed.)
            result = re.search('(BOLD_END.*<BOLD>)|(ITALICS_END.*<ITALICS>)',
                               content)
            final = result is not None and len(result.group()) > 14

            # If we reach a period.  NOTE: `and` binds tighter than `or`,
            # so two periods always end the sentence regardless of the markup
            # checks -- preserved from the original logic.
            if '.' in char and (('<BOLD>' not in content[:7] or final) and
                                ('<ITALICS>' not in content[:10]
                                 or final)) or content.count('.') == 2:
                sentence.add_text(
                    Text(text.run_offset, text.content,
                         beginningOfSentenceOffset - text.run_offset,
                         len(content)))
                sentences.append(sentence)
                sentence = Sentence()
                content = ''
                beginningOfSentenceOffset = text.run_offset + distance_from_run_offset

        # Carry any trailing (unterminated) fragment of this run.
        sentence.add_text(
            Text(text.run_offset, text.content,
                 beginningOfSentenceOffset - text.run_offset, len(content)))
        distance_from_run_offset = 0

    if content != '':
        sentences.append(sentence)
    return sentences
Exemplo n.º 3
0
# final_df = final_df[~final_df.business_id.isnull()]
# Drop rows missing either the sentence text or the business id.
final_df = final_df.dropna(axis=0, subset=[
    'sentence', 'business_id'
])  ## MY ADDITION HOPING IT DOESN"T MESS THINGS UP
final_df.to_pickle('final_df.pkl')  ## REMOVE

# FEATURIZE

## Import Sentence class from this project
# import sys
# sys.path.append("/var/www/sandbox/ben/opinion-mining")
from classes.sentence import Sentence

print("Featurizing the training data frame (may take a little while)")

sents = [Sentence(sent) for sent in final_df.sentence]

for sent, stars in zip(sents, final_df.review_stars):
    sent.stars = stars  # pass the number of stars in

featurized_df = pd.DataFrame([sent.get_features() for sent in sents])

# Keep only sentences that carry a sentiment label.
featurized_df['sentiment'] = final_df.sentiment
featurized_df = featurized_df[~featurized_df.sentiment.isnull()]

print("Done.")
# ipdb.set_trace()

# Adjust sentiment labels.  Use .loc for in-place assignment: the original
# chained indexing (featurized_df.sentiment[mask] = ...) raises pandas'
# SettingWithCopyWarning and may silently fail to write through.
featurized_df.loc[featurized_df.sentiment == 'Positive', 'sentiment'] = 1
featurized_df.loc[featurized_df.sentiment == 'Negative', 'sentiment'] = -1
Exemplo n.º 4
0
from classes.sentence import Sentence
from classes.tag import Tag

# Demo: build a Sentence carrying three labelled spans and print its fields.
tag_specs = [(1, 6, "mylabel"), (10, 16, "something"), (25, 30, "whatever")]
tags = [Tag(start, end, label) for start, end, label in tag_specs]

sentence = Sentence("black", tags)

print(sentence.sentence)
print(sentence.tags[0].start)
Exemplo n.º 5
0
        description='Script to convert a list of strings to concepts')

    # CLI arguments: the sentence to analyse and the concepts file path.
    parser.add_argument('--sen',
                        type=str,
                        default='Which restaurants do West Indian ?food',
                        help='sentence string')
    parser.add_argument(
        '--fpath',
        type=str,
        default='./concepts.list',
        help='location of file containing the list of concepts')
    args = parser.parse_args()

    # Arguments from the parser.
    sen = Sentence(args.sen)
    conlist_fpath = args.fpath

    # Read the list of concepts from the specified file (first CSV column
    # of each row becomes one Concept).
    conlist = []
    with open(conlist_fpath, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            conlist.append(Concept(row[0]))
    '''
    The preprocessing involves on-the-fly generation of a table
    for each concept in the csv. This can either be done apriori
    for the whole database or can be done on-the-fly for batches of data
    for the data-parallelism case.
    '''
Exemplo n.º 6
0
def preprocessText(text_file):
    """Parse an SGML-annotated news file into a (Document, sentences) pair.

    Detects the source corpus by substring match on the *file path*
    (wsj / ABC / ea-ed / PRI / CNN / VOA / XIE / AP9 / APW / NYT), extracts
    the document number, date, journal and subject from that corpus' header
    layout, then turns each non-empty line between <TEXT> and </TEXT> into
    a Sentence object.

    :param text_file: path to the annotated text file
    :return: tuple (Document, list of Sentence)

    NOTE(review): if the path matches none of the known corpora, documentNo,
    journal and newDate are never assigned and the Document(...) call at the
    end raises UnboundLocalError -- confirm all callers pass corpus files.
    """
    number = 0
    currentIndex = 0  # Could be 6 to keep the current numbers of the original annotations
    sentences = []
    subject = ""
    additionalInformation = ""

    # NOTE(review): `file` shadows the builtin and the handle is never
    # explicitly closed (relies on garbage collection).
    file = open(text_file, 'r')
    text = file.read()
    separateLines = text.splitlines()
    # Body lines live between the <TEXT> and </TEXT> marker lines.
    beginIndex = separateLines.index("<TEXT>") + 1
    endIndex = separateLines.index("</TEXT>")

    # For wsj files
    if "wsj" in text_file:
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                # Slice strips the surrounding tag text -- presumably
                # "<DOCNO> ... </DOCNO>"; verify against the corpus format.
                documentNo = separateLines[s][8:-9]
            if "<HL>" in separateLines[s]:
                beginSubject = s
            if "</HL>" in separateLines[s]:
                endSubject = s

        # Getting document data: date and journal are the two lines right
        # after the headline block.
        date = separateLines[endSubject + 1]
        journal = separateLines[endSubject + 2]
        for part in range(0, beginIndex - 1):
            if beginSubject <= part <= endSubject - 1:
                subject += separateLines[part] + str(" ")
            elif part == endSubject:
                subject += separateLines[part]

            # Everything after date/journal goes into the extra-info field.
            if part > endSubject + 2:
                if "<DATELINE>" in separateLines[part]:
                    additionalInformation += separateLines[part][10:-11]
                else:
                    additionalInformation += separateLines[part]

        # Drop the surrounding <HL> ... </HL> markup from the subject.
        subject = subject[5:-6]

        # Dates look like MM/DD/YY; expand the 2-digit year to 19xx or 20xx.
        dateParts = date.split("/")
        if (int(dateParts[2])) > 20:
            newDate = "19" + dateParts[2][:2]
        else:
            newDate = "20" + dateParts[2][:2]
        newDate = newDate + "-" + dateParts[0][1:] + "-" + dateParts[
            1] + "T00:00:00"

    # FOR ABC files: date is embedded in the document number (YYYYMMDD).
    if "ABC" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "broadcast news"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"

    # For ea/ed files.  NOTE(review): plain substring match -- any path
    # containing "ea" or "ed" (e.g. "readme") would trip this branch.
    if "ea" in text_file or "ed" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "broadcast"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        # Document number carries a 2-digit year; assume 19xx.
        newDate = "19" + documentNo[2:4] + "-" + documentNo[
            4:6] + "-" + documentNo[6:8] + "T00:00:00"

    # For PRI files.
    if "PRI" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "broadcast"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"

    # For CNN files.
    if "CNN" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "CNN"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"

    # For VOA files.
    if "VOA" in text_file:
        beginIndex = beginIndex + 1  # Extra enter between <TEXT> and the text
        subject = "NEWS STORY"
        journal = "VOA"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
                break
        newDate = documentNo[3:7] + "-" + documentNo[7:9] + "-" + documentNo[
            9:11] + "T00:00:00"

    # For XIE files: date comes from a DATE_TIME line, subject from the
    # lines between <HEADLINE> and </HEADLINE>.
    if "XIE" in text_file:
        subject = "NEWS STORY"
        journal = "broadcast"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][9:-9]
            if "DATE_TIME" in separateLines[s]:
                datefile = separateLines[s][12:-13]
            if "<HEADLINE>" in separateLines[s]:
                start_subject = s + 1
            if "</HEADLINE>" in separateLines[s]:
                end_subject = s

        for line in range(start_subject, end_subject):
            subject += separateLines[line] + " "
        newDate = datefile[1:] + "T00:00:00"

    # FOR AP files: subject runs from <SECOND> to the LaserPhotos line.
    if "AP9" in text_file:
        subject = ""
        journal = "Associated Press Writer"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
            if "<SECOND>" in separateLines[s]:
                start_subject = s + 1
            if "LaserPhotos" in separateLines[s]:
                end_subject = s

        for line in range(start_subject, end_subject):
            subject += separateLines[line]

        newDate = "19" + documentNo[2:4] + "-" + documentNo[
            4:6] + "-" + documentNo[6:8] + "T00:00:00"

    # For APW and NYT files: both use the DATE_TIME / HEADLINE layout.
    if "APW" in text_file or "NYT" in text_file:
        subject = ""
        journal = "Associated Press Writer"
        if "NYT" in text_file:
            journal = "NYT"
        for s in range(len(separateLines)):
            if "DOCNO" in separateLines[s]:
                documentNo = separateLines[s][8:-9]
            if "DATE_TIME" in separateLines[s]:
                datefile = separateLines[s][12:-13]
            if "<HEADLINE>" in separateLines[s]:
                start_subject = s + 1
            if "</HEADLINE>" in separateLines[s]:
                end_subject = s

        for line in range(start_subject, end_subject):
            subject += separateLines[line] + " "

        # Lines between the headline and <TEXT> become additional info.
        for add in range(end_subject + 1, beginIndex - 1):
            additionalInformation += separateLines[add] + " "

        # DATE_TIME is presumably MM/DD/YYYY HH:MM:SS -- rearranged to ISO.
        newDate = datefile[6:10] + "-" + datefile[:2] + "-" + datefile[
            3:5] + "T" + datefile[11:19]

    # Generate sentence objects.  NOTE: `sentence` here is a line index,
    # not a Sentence instance.
    for sentence in range(beginIndex, endIndex):
        if (len(separateLines[sentence]) == 0):
            currentIndex += 1  # empty line as extra index
            continue
        # Separator rules ("---") are skipped but still advance the offset.
        if "---" in separateLines[sentence]:
            currentIndex += 4
            continue

        sentences.append(
            Sentence(currentIndex, separateLines[sentence], number))
        # Continue character indexing right after the previous sentence.
        currentIndex = sentences[-1].endIndex + 1
        number += 1

    document = Document(subject, journal, documentNo, additionalInformation,
                        newDate)
    return (document, sentences)